Code example #1
def makeTestSet(file_type):
    makeDirOrdie(test_dir, False)
    test_files = {}
    for entry in test_data.keys():
        sequence_names = test_data[entry].split(",")
        test_files[entry] = makeTestFile("testSet/%s.%s" % (entry, file_type), file_type, sequence_names)
    return test_files
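Every snippet on this page calls the helper makeDirOrdie, whose definition is not shown. As a rough orientation only, here is a minimal sketch of what such a "create the directory or die" helper could look like; the signature and error handling are assumptions, not the project's actual code.

# Sketch only (assumed behaviour): create the directory, and abort if it already
# exists unless the caller passes a second argument of False, as makeTestSet() does above.
import os
import sys

def makeDirOrdie(dir_path, orDie=True):
    if os.path.isdir(dir_path):
        if orDie:
            sys.exit("Directory %s already exists." % dir_path)
    else:
        os.makedirs(dir_path)
    return dir_path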
Code example #2
File: arms.py  Project: mahdi-b/ARMS
def preprocessData(args, pool=Pool(processes=1)):
   # TODO: test run.name is a single word
   makeDirOrdie(args.outDir)

   printVerbose("Preprocessing the data:")

   # *****************************************************************************************
   printVerbose("\t Renaming sequences")
   # "~/programs/fastx/bin/fastx_renamer -n COUNT -i %s %s"
   rename_outFile_f = os.path.join("outDir/", os.path.basename(args.input_f)+"_renamed")
   rename_outFile_r = os.path.join("outDir/", os.path.basename(args.input_r)+"_renamed")

   pool.map(runInstance, [ProgramRunner("fastx_renamer",[args.input_f, rename_outFile_f], {"exists":[args.input_f]}),
                          ProgramRunner("fastx_renamer",[args.input_r, rename_outFile_r], {"exists":[args.input_r]}),
                          ])
   printVerbose("\tRenamed X sequences")
   # *****************************************************************************************
   # Making the contigs using Pear
   # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s"
   assembledPrefix = os.path.join("outDir", args.name)
   pool.map(runInstance, [ProgramRunner("pear", 
                                           (rename_outFile_f, rename_outFile_r, assembledPrefix, args.threads), 
                                           {"exists":[rename_outFile_f, rename_outFile_r]}) 
                          ])
   assembledFastqFile = os.path.join("outDir", args.name+".assembled.fastq")
   # add py char to a web-page
   printVerbose("\t %s sequences assembled, %s contigs discarded, %s sequences discarded" % (1,1,1))


   # *****************************************************************************************
   # Converting fastq to fasta file (do with mothur or BioSeqIO to keep prog deps to a minimum)
   pool.map(runInstance, [ProgramRunner("fastq.info", 
                                           [assembledFastqFile], 
                                           {"exists": [assembledFastqFile]}) 
                          ])
   assembledFastaFile = os.path.splitext(assembledFastqFile)[0]+".fasta"
   # TODO: add py char to a web-page
   printVerbose("\t converted fastq to fasta")
   # *****************************************************************************************
   # Trimming and assigning reads to groups
   # trim.seqs(fasta=%, oligos=%s, maxambig=0, maxhomop=8, minlength=300, maxlength=550, bdiffs=1, pdiffs=2)

   pool.map(runInstance, [ProgramRunner("trim.seqs",
                                        [assembledFastaFile, args.barcodes],
                                        {"exists": [assembledFastaFile]})
                          ])
   printVerbose("\t %s sequences were assigned to groups and %s sequences were discareded")
   trimmedFasaFile = os.path.splitext(assembledFastqFile)[0]+".trim.fasta"
   # *****************************************************************************************
   # Aligning against the BIOCODETEMPLATE database
   pool.map(runInstance, [ProgramRunner("align.seqs",
                                        [trimmedFasaFile, args.db],
                                        {"exists": [trimmedFasaFile]})
                          ])
   printVerbose("\t %s sequences were assigned to groups and %s sequences were discareded")
Code example #3
File: test_utils.py  Project: pythseq/Chewbacca
def makeTestSet(file_type):
    makeDirOrdie(test_dir, False)
    test_files = {}
    for entry in test_data.keys():
        sequence_names = test_data[entry].split(",")
        test_files[entry] = makeTestFile("testSet/%s.%s" % (entry, file_type),
                                         file_type, sequence_names)
    return test_files
Code example #4
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in run_parallel
        run_parallel([PythonRunner(serialRename,
                                   [input_,
                                    "%s/%s_renamed%s" % (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                                    filetype, clip], {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
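For orientation, the output path built inside the PythonRunner call above resolves as sketched below. strip_ixes is assumed to drop chewbacca suffixes and extensions from a file name; the input path and the stand-in used here are purely illustrative.

# Illustration (hypothetical input): how the "%s/%s_renamed%s" output path is assembled.
import os

input_ = "samples/site01.fastq"
outdir = "renamed_out"
base = os.path.splitext(os.path.basename(input_))[0]   # stand-in for strip_ixes(input_)
out_path = "%s/%s_renamed%s" % (outdir, base, os.path.splitext(input_)[1])
print(out_path)   # renamed_out/site01_renamed.fastq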
Code example #5
def test_assemble():
    """
    Test the Assemble_Command to ensure that: \n
    1. command creates just one output file in outdir. \n
    2. output file is correct (1 assembled read). \n
    3. command creates aux dir.

    parser_assemble.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_assemble.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_assemble.add_argument('-n', '--name', required=True, help="Assembled File Prefix.")
    parser_assemble.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    """

    params = lambda: 0
    test_reads = "assemble_data/test_%s.fq"
    params.input_f = test_reads % "R1"
    params.input_r = test_reads % "R2"
    params.outdir = "rslt"
    params.name = "test"
    params.processes = 1
    params.pearthreads = 1
    params.extraargstring = ""

    for program in Assemble_Command.supported_programs:
        cleanup_files(params.outdir)
        makeDirOrdie(params.outdir)
        params.program = program.name
        Assemble_Command(params).execute_command()
        assert_outdir(params.outdir)
        output_files = getInputFiles(params.outdir, "*assembled*")
        assert_equals(len(output_files), 1)
        assert_auxdir(params.outdir)
        seqs = fasta_to_list(output_files[0], 'fastq')
        assert_equals(len(seqs), 1)
        for seq in seqs:
            assert_true("good" in seq.id)
    return True
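The params = lambda: 0 line above simply creates a throwaway object that accepts arbitrary attribute assignments, standing in for parsed argparse arguments. An equivalent, more explicit construction (illustrative, not part of the test suite) would be:

# Equivalent stand-in for the parsed command-line arguments consumed by Assemble_Command.
from argparse import Namespace

params = Namespace(
    input_f="assemble_data/test_R1.fq",   # forward reads fixture
    input_r="assemble_data/test_R2.fq",   # reverse reads fixture
    outdir="rslt",
    name="test",
    processes=1,
    pearthreads=1,
    extraargstring="",
)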
Code example #6
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in run_parallel
        run_parallel([
            PythonRunner(serialRename, [
                input_,
                "%s/%s_renamed%s" %
                (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                filetype, clip
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir,
                                      "*.samples",
                                      ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir,
                                  "*.mapping",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Code example #7
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes,
                            stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are considered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            run_parallel([
                PythonRunner(removeCountsFromFastFile, [
                    input_,
                    "%s/%s_uncount.fasta" %
                    (outdir, strip_ixes(input_)), 'fasta'
                ], {"exists": input_}) for input_ in inputs
            ], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH, [
                processes, input_,
                "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                "%s/%s_uc.out" % (outdir, strip_ixes(input_))
            ], {
                "exists": [input_],
                "positive": [processes]
            }, extraargstring) for input_ in inputs
        ], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        debugPrintInputInfo(inputs, "parsed (into a .groups file)")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_,
                 "%s/%s_derep.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in input_ucs
        ], pool)

        most_recent_groups_files = getInputFiles(outdir,
                                                 "*_derep.groups",
                                                 ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the oldgroups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" %
                         len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" %
                         len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir,
                          "dereplicated")
            most_recent_groups_files = getInputFiles(outdir,
                                                     "dereplicated*",
                                                     ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        if len(inputs) != len(most_recent_groups_files):
            print(
                "Error: Number of input fastas (%d) is not equal to the number ofgroups files (%d)."
                % (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([
            PythonRunner(renameWithReplicantCounts, [
                fasta, groups,
                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'
            ], {"exists": [fasta, groups]})
            for fasta, groups in fasta_groups_pairs
        ], pool)
        printVerbose("Done adding data")

        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir,
                                  '*',
                                  "*_counts.fasta",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
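The DEREP_VSEARCH command run above is defined elsewhere; the CLI section later on this page quotes its template as "vsearch --threads %d --derep_fulllength %s --sizeout --fasta_width 0 --output %s -uc %s". Under that assumption, a single job is roughly equivalent to the direct invocation below (file names are illustrative).

# Rough equivalent of one DEREP_VSEARCH job, using the template quoted in the CLI section.
import subprocess

cmd = ("vsearch --threads %d --derep_fulllength %s --sizeout "
       "--fasta_width 0 --output %s -uc %s") % (2, "sample.fasta",
                                                "sample_derep.fasta", "sample_uc.out")
subprocess.check_call(cmd, shell=True)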
Code example #8
def main(argv):
    """Parses command line args, builds an argparse.ArgumentParser, and runs the chosen command.
        Otherwise, prints usage.

    :param argv: Command line arguments as a list of strings
    """
    parser = argparse.ArgumentParser(description="arms description", epilog="arms long description")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version)
    parser.add_argument("--verbose", dest='verbose', help="Increase output verbosity")
    parser.add_argument('-t', '--processes', type=int, default=1, help="The maximum number of processes to spawn.")
    parser.add_argument('--dryrun', dest='dryrrun', default=False, action='store_true', help="Return command line \
                           commands without validation or execution.")
    parser.add_argument('--debugtest', default=False)
    parser.add_argument('-y', '--extraargstring', default="", required=False, help="Auxiliary parameters you wish to \
                            pass to the called program (such as options chewbacca doesn't support).  USE AT YOUR OWN \
                            RISK.")

    subparsers = parser.add_subparsers(dest='action', help='Available commands')

    # ====================================
    # ==  Fix reads with Baye's hammer  ==
    # ====================================
    # "SPADES_PRECLEAN":  --only-error-correction -o %s -1 %s -2 %s"
    parser_preclean = subparsers.add_parser('preclean', description="Given a pair of left and right fasta/fastq reads, \
                            or a pair of folders containing the left and right fasta/fastq files, fixes short errors \
                            in the reads using BayesHammer.  Forward reads filegroups should end in \
                            '_forward.<ext>' or '_R1.<ext>'.  Reverse reads filegroups should end in '_reverse.<ext>' \
                            or '_R2.<ext>' (where <ext> is the file extension).")
    parser_preclean.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_preclean.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_preclean.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_preclean.add_argument('-p', '--program', required=False, default="bayeshammer", help="Indicates which \
                            program to use.  Choices are: 'bayeshammer'.  Default: 'bayeshammer'.")
    parser_preclean.add_argument('-j', '--bayesthreads', type=int, required=False, default=1, help="The number of \
                            threads to use per process (default is 1).")

    parser_preclean.set_defaults(command=Preclean_Command)

    # =================================
    # ==  Assemble Reads using pear  ==
    # =================================
    # "pear": programPaths["PEAR"] + " -f \"%s\" -r \"%s\" -o \"%s\" -j %d "
    parser_assemble = subparsers.add_parser('assemble', description="Given a pair of left and right fasta/fastq reads, \
                            or a pair of folders containing the left and right fasta/fastq files, assembles the left \
                            and right read files into contiguous sequence files.  Forward reads filegroups should \
                            end in '_forward.<ext>' or '_R1.<ext>'.  Reverse reads filegroups should end in \
                            '_reverse.<ext>' or '_R2.<ext>'.  (where <ext> is the file extension).")
    parser_assemble.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_assemble.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_assemble.add_argument('-n', '--name', required=True, help="Assembled File Prefix.")
    parser_assemble.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_assemble.add_argument('-p', '--program', required=False, default="pear", help="Indicates which \
                            program to use.  Choices are: 'pear'.  Default: 'pear'.")
    parser_assemble.add_argument('-j', '--pearthreads', type=int, required=False, default=1, help="Pear: The number of \
                            threads to use per process (default is 1).")
    parser_assemble.set_defaults(command=Assemble_Command)

    # ====================================
    # ==   Split by barcode with FastX  ==
    # ====================================
    # "barcode.splitter": "cat \"%s\" | " + programPaths["FASTX"] + "fastx_barcode_splitter.pl  --bcfile \"%s\" \
    #                                    -prefix \"%s\" --suffix .fastq --bol --mismatches 1",
    parser_demux_barcode = subparsers.add_parser('demux_barcode', description="Given a single barcodes file, and a \
                            fasta/fastq file or a folder containing fasta/fastq files, splits each input fasta/fastq \
                            file into separate sample files (based on each sequence's barcode), for each barcode in \
                            the barcodes file.  Sample files of size zero are ignored.")
    parser_demux_barcode.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_demux_barcode.add_argument('-b', '--barcodes', required=True,
                              help="Tab delimted files of barcodes and corresponding sample names.")
    parser_demux_barcode.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_demux_barcode.add_argument('-p', '--program', required=False, default="fastx", help="Indicates which \
                            program to use.  Choices are: 'fastx'.  Default: 'fastx'.")
    parser_demux_barcode.set_defaults(command=Demux_Barcode_Command)


    # =====================================
    # ==   Split by name with Chewbacca  ==
    # =====================================
    # "barcode.splitter": "cat \"%s\" | " + programPaths["FASTX"] + "fastx_barcode_splitter.pl  --bcfile \"%s\" \
    #                                    -prefix \"%s\" --suffix .fastq --bol --mismatches 1",
    parser_demux_name = subparsers.add_parser('demux_name', description="Given a barcodes file listing sequence names \
                            and 'fake' barcode sequences, and a \
                            fasta/fastq file or a folder containing fasta/fastq files, splits each input fasta/fastq \
                            file into separate sample files (based on a sample-keyword in each sequence's name), \
                            for each sample name in the barcodes file.  Sample files of size zero are ignored.")
    parser_demux_name.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_demux_name.add_argument('-b', '--barcodes', required=True,
                              help="Tab delimted files of barcodes and corresponding sample names.  Barcode sequences \
                              are ignored and can be faked.  Only sample names will be read.")
    parser_demux_name.add_argument('-f', '--filetype', required=True, help="Indicates input and output filetype.  \
                            Either 'fasta' or 'fastq'.")
    parser_demux_name.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_demux_name.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_demux_name.set_defaults(command=Demux_Name_Command)



    # ==================================================
    # ==  Rename reads serially with renameSequences  ==
    # ==================================================
    # renameSequences(input, output)
    parser_rename = subparsers.add_parser('rename', description="Given a fasta/fastq file or directory of fasta/fastq \
                            files, serially renames each sequence in each file, with the filename as a \
                            prefix.  e.g. the sequences in abc.fasta are renamed abc_ID0, abc_ID1, abc_ID2, etc.")
    parser_rename.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_rename.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_rename.add_argument('-f', '--filetype', required=True, help="The filetype of the input files.  Either \
                            'fasta' or 'fastq'.")
    parser_rename.add_argument('-c', '--clip', required=False, default=True, help="Set True if input file groups \
                            contain trailing identifiers from the demux_seqs step.  e.g. True if file names contain \
                            '_0', '_1', '_2', etc..  Default: True.")
    parser_rename.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_rename.set_defaults(command=Rename_Command)

    # =================================================
    # ==  Trims barcodes and adapters using flexbar  ==
    # =================================================
    # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\""
    parser_trim = subparsers.add_parser('trim_adapters', description="Given a single adapters file, a single \
                            adaptersrc file, and a fasta/fastq file or a folder containing fasta/fastq files, removes \
                            the specified adapters (and preceding barcodes) from all sequences in the given \
                            fasta/fastq files.")
    parser_trim.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_trim.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_trim.add_argument('-a', '--adapters', required=True, help="Forwards Adapters file.")
    parser_trim.add_argument('-arc', '--adaptersrc', required=True, help="Reverse Complemented Adapters file.")
    parser_trim.add_argument('-p', '--program', required=False, default="flexbar", help="Indicates which \
                            program to use.  Choices are: 'flexbar'.  Default: 'flexbar'.")
    parser_trim.add_argument('-u', '--allowedns', required=False, default=0, type=int, help=" Flexbar: The number of \
                            unknown 'N' bases a sequence is allowed before being thrown out.  Default: 0.")
    parser_trim.set_defaults(command=Clean_Adapters_Command)

    # ====================================
    # ==  Clean Reads with Low Quality  ==
    # ====================================
    # Clean low-quality reads with trimmomatic
    # "trimomatic":       "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    # -phred33 input output_cleaned.fastq SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    parser_trimmomatic = subparsers.add_parser('clean_seqs', description="Given a single fastq file or a folder \
                            containing fastq files, trims away areas of low read quality (specified by -q) in each \
                            sequence.  Then, the longest remaining segment (of at least length -m) is recorded as the \
                            new sequence.")
    parser_trimmomatic.add_argument('-i', '--input_f', required=True, help="Input fastq file or folder")
    parser_trimmomatic.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_trimmomatic.add_argument('-p', '--program', required=False, default="trimmomatic", help="Indicates which \
                            program to use.  Choices are: 'trimmomatic'.  Default: 'trimmomatic'.")
    parser_trimmomatic.add_argument('-m', '--minlen', type=int, default=200,
                                    help="Trimmomatic: Minimum length for cleaned sequences")
    parser_trimmomatic.add_argument('-w', '--windowSize', type=int, default=5,
                                    help="Trimmomatic: Size of the sliding window")
    parser_trimmomatic.add_argument('-q', '--quality', type=int, default=25,
                                    help="Trimmomatic: Minimum average quality for items in the sliding window")
    parser_trimmomatic.set_defaults(command=Clean_Quality_Command)

    # ==========================================
    # ==  Dereplicate sequences with vsearch  ==
    # ==========================================
    # " vsearch --threads %d --derep_fulllength %s --sizeout --fasta_width 0 --output %s -uc %s",
    parser_derep = subparsers.add_parser('dereplicate_fasta', description="Given a fasta file or folder, removes \
                            identical duplicates and subsequences WITHIN EACH FILE, keeping the longest sequence, and \
                            regroups it with the number of duplicates as '<longest_sequence_name>_<duplicate count>'.")
    parser_derep.add_argument('-i', '--input_f', required=True, help="Input fasta file or folder of fasta files.")
    parser_derep.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_derep.add_argument('-j', '--threads', required=False, type=int, default=2, help="Number of threads to use \
                            per query process.")
    parser_derep.add_argument('-g', '--groupsfile', required=False, help="A .groups file to update.  If no .groups \
                            file is provided, then sequences are assumed to be singletons.")
    parser_derep.add_argument('-p', '--program', required=False, default="vsearch", help="Indicates which \
                            program to use.  Choices are: 'vsearch'.  Default: 'vsearch'.")
    parser_derep.add_argument('-s', '--stripcounts', required=False, type=bool, default=False, help="If included, \
                            strip counts from sequence groups before clustering.  This allows for the recognition of \
                            sequence groups that are annotated with dereplication counts.")
    parser_derep.set_defaults(command=Dereplicate_Command)

    # ============================================
    # ==  Partition fastas with splitKperFasta  ==
    # ============================================
    # splitK(inputFasta, prefix, nbSeqsPerFile, filetype):
    parser_split = subparsers.add_parser('partition', description="Given a fasta/fastq file, or folder containing \
                            fasta/fastq files, splits each file into a set of sub files labeled \
                            <filename>_part_x.<ext>, where each subfile contains at most <--chunksize> sequences. \
                            (Where <ext> is the file extension of the original file.")
    parser_split.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_split.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_split.add_argument('-c', '--chunksize', type=int, required=True, help="Chunksize. (Max number of sequences \
                            per file).")
    parser_split.add_argument('-f', '--filetype', required=True, help="The filetype of the input files.  Either \
                            'fasta' or 'fastq'.")
    parser_split.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_split.set_defaults(command=Partition_Command)

    # =======================
    # ==  Merge the files  ==
    # =======================
    # merge_files(input_file_list, output_file)
    parser_cat = subparsers.add_parser('merge_files', description="Given a folder containing any type of file, \
                            concatenates all files in that folder together, regardless of their filetype.  Note: Any \
                            file headers/footers will be written as well.")
    parser_cat.add_argument('-i', '--input_f', required=True, help="Input directory containing files to concatenate.")
    parser_cat.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_cat.add_argument('-n', '--name', required=True, help="Prefix name for the merged file.")
    parser_cat.add_argument('-f', '--fileext', required=True, help="File extension for the output file.")
    parser_cat.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_cat.set_defaults(command=Merge_Command)

    # =======================
    # ==  ungap the files  ==
    # =======================
    # ungap(file_to_clean, output_file_name, gap_char, file_type)
    parser_ungap = subparsers.add_parser('ungap_fasta', description="Given a fasta/fastq file or a folder containing \
                            fasta/fastq files, removes alignment characters (specified by -g).")
    parser_ungap.add_argument('-i', '--input_f', required=True, help="Input directory containing files to concatenate.")
    parser_ungap.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_ungap.add_argument('-f', '--fileext', required=True, help="File extension for the output file.  Either \
                            'fasta', or 'fastq'.")
    parser_ungap.add_argument('-g', '--gapchars', required=True, help="A string of one or more characters to remove \
                            from the sequences (but not sequence groups) in the input files.")
    parser_ungap.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_ungap.set_defaults(command=Ungap_Command)

    # ==============================
    # ==  Align Reads with MACSE  ==
    # ==============================
    # "macse_align":      "java -jar " + programPaths["MACSE"] + " -prog enrichAlignment  -seq \"%s\" -align \
    #                                \"%s\" -seq_lr \"%s\" -maxFS_inSeq 0  -maxSTOP_inSeq 0  -maxINS_inSeq 0 \
    #                                -maxDEL_inSeq 3 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT \"%s\"_NT \
    #                                -out_AA \"%s\"_AA -seqToAdd_logFile \"%s\"_log.csv",
    parser_align = subparsers.add_parser('macseAlign', description="Aligns sequences using the 'enrichAlignment' \
                            program in Macse.  Removes chimeras, invalid sequences, and orients sequences in the \
                            forward read direction.")
    parser_align.add_argument('-i', '--input_f', required=True, help="Input fasta")
    parser_align.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_align.add_argument('-d', '--db', required=True, help="Database against which to align and filter reads")
    parser_align.add_argument('-p', '--program', required=False, default="macse", help="Indicates which \
                            program to use.  Choices are: 'macse'.  Default: 'macse'.")
    parser_align.set_defaults(command=Clean_Deep_Command)

    # =====================================
    # == Clean Aligned Reads with MACSE  ==
    # =====================================
    # "macse_format":     "java -jar " + programPaths["MACSE"] + "  -prog exportAlignment -align \"%s\" \
    #                                -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\"",
    parser_macse_clean = subparsers.add_parser('macseClean', description="Removes gap characters inserted by the macse \
                            aligner.")
    parser_macse_clean.add_argument('-i', '--input_f', required=True, help="Input fasta")
    parser_macse_clean.add_argument('-s', '--samplesdir', required=True, help="Samples dir")
    parser_macse_clean.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_macse_clean.add_argument('-d', '--db', required=True, help="Database against which to align and filter \
                            reads")
    parser_macse_clean.add_argument('-p', '--program', required=False, default="macse", help="Indicates which \
                             program to use.  Choices are: 'macse'.  Default: 'macse'.")
    parser_macse_clean.set_defaults(command=Clean_Deep_Repair_Command)

    # ============================================
    # ==  Cluster using CROP, SWARM, OR VSEARCH ==
    # ============================================
    parser_cluster = subparsers.add_parser('cluster_seqs', description="Given a fasta file or a folder containing \
                            fasta files, performs clustering on each input file individually.  Outputs a .groups file \
                            listing unique representative sequences from each cluster, and the groups of the\
                            sequences they represent.  Also outputs a <input_file_name>_seeds.fasta file of the unique \
                            representatives.")
    parser_cluster.add_argument('-i', '--input_f', required=True, help="Input fasta file/folder")
    parser_cluster.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_cluster.add_argument('-p', '--program', required=False, default="swarm", help="One of 'crop', 'swarm', or \
                            'vsearch' to indicate which clustering program to use.  Default: 'swarm'.")
    parser_cluster.add_argument('-g', '--groupsfile', required=False, help="A .groups file to update.")
    parser_cluster.add_argument('-s', '--stripcounts', required=False, type=bool, default=True, help="If True, strip \
                            counts from sequence groups before clustering.  This allows for the recognition of \
                            group names in cases where dereplication counts have been added to group names.")
    # CROP options
    parser_cluster.add_argument('-z', '--blocksize', required=False, type=int, default=500, help="CROP only: Size of \
                            blocks to be used for all rounds (if -b is specified, then -z will not affect the first \
                            round.  For data set with different average sequence length, this parameter should be \
                            tuned such that it won't take too long for each block to do pairwise alignment.  Hint for \
                            choosing z: z*L<150,000, where L is the average length of the sequences.  Default: 500.")
    parser_cluster.add_argument('-b', '--blockcount', required=False, type=int, help="CROP only: The size of blocks in \
                            the first round of clustering. Hint of choosing -b: Each block in the first round should \
                            contain about 50 sequences.  i.e. b=N/50, where N is the number of input sequences.  \
                            Default: # input sequences / z.")
    parser_cluster.add_argument('-e', '--maxmcmc', required=False, type=int, default=2000, help="CROP only: This \
                            parameter specifies the number of iterations of MCMC. Default value is 2000. Increase this \
                            value to enhance accuracy (recommended value is at least 10*block size).")
    parser_cluster.add_argument('-c', '--clustpct', required=False, default="g", help="CROP only: The minimum \
                            similarity threshold for clustering.  Either 'g' for 95%% or 's' for 97%%.  Default: 'g'.")
    parser_cluster.add_argument('-m', '--maxsm', required=False, type=int, default=20, help="CROP only: This parameter \
                            specifies the maximum number of 'split and merge' processes to run. Default value is 20, \
                            which is also the maximum allowed.")
    parser_cluster.add_argument('-r', '--rare', required=False, type=int, default=2, help="CROP only: The maximum \
                            cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in the above procedure, and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice. \
                            Default: 2.")
    # Vsearch options
    parser_cluster.add_argument('-v', '--idpct', required=False, type=float, default=.95, help="VSEARCH only: %% match \
                            required for clustering.  Real number in the range (0,1]. Default: 0.95")
    parser_cluster.set_defaults(command=Cluster_Command)

    # ================================================
    # ==  Closed Ref Picking with fasta references  ==
    # ================================================
    # --usearch_global  ../9_p_uchime/seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    # --userfields query+target+id+alnlen+qcov --userout out  --alnout alnout.txt
    parser_query_fasta = subparsers.add_parser('query_fasta', description="Given a fasta file, or folder containing \
                            fasta files, aligns and identifies OTUs against a curated fasta file.")
    parser_query_fasta.add_argument('-i', '--input_f', required=True, help="Input file/folder with fasta files")
    parser_query_fasta.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_query_fasta.add_argument('-r', '--referencefasta', required=True, help="Filepath to the curated fasta file \
                            to use as a reference.")
    parser_query_fasta.add_argument('-x', '--taxinfo', required=True, help="Filepath to a two-column, tab-delimited \
                            file mapping a sequence's fasta id (in the referencefasta file) to a taxonomic \
                            identification.")
    parser_query_fasta.add_argument('-s', '--simmilarity', required=False, default=0.97, type=float, help="Minimum %%  \
                            similarity (decimal between 0 and 1) between query and reference sequences required for \
                            positive identification. Default: .97")
    parser_query_fasta.add_argument('-p', '--program', required=False, default="vsearch", help="Indicates which \
                            program to use.  Choices are: 'vsearch'.  Default: 'vsearch'.")
    parser_query_fasta.add_argument('-c', '--coverage', required=False, default=0.85, type=float, help="Minimum %% coverage \
                            (decimal between 0 and 1) between query and reference sequences required for positive \
                            identification. Default: .85")
    parser_query_fasta.add_argument('-j', '--threads', required=False, type=int, default=2,
                                    help="Number of threads to use per query process.")
    parser_query_fasta.set_defaults(command=Query_OTU_Fasta_Command)

    # ====================================
    # ==  Closed Ref Picking with NCBI  ==
    # ====================================
    # --usearch_global ../9_p_uchime/seeds.pick.fasta  --db /home/mahdi/refs/COI_DOWNLOADED/COI.fasta -id 0.9 \
    #          --userfields query+target+id+alnlen+qcov --userout out --alnout alnout.txt
    # --userfields query+target+id+alnlen+qcov
    parser_query_db = subparsers.add_parser('query_db', description="Given a fasta file, or folder containing \
                            fasta files, aligns and identifies OTUs against a curated database.")
    parser_query_db.add_argument('-i', '--input_f', required=True, help="Input Fasta File to clean")
    parser_query_db.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_query_db.add_argument('-r', '--referencefasta', required=True, help="Filepath to the curated fasta file \
                            to use as a reference.")
    parser_query_db.add_argument('-d', '--db', required=True, help="Filepath to the curated fasta file \
                            to use as a reference.")
    parser_query_db.add_argument('-s', '--simmilarity', required=False, default=0.97, type=float, help="Minimum %%  \
                                similarity (decimal between 0 and 1) between query and reference sequences \
                                required for positive identification. Default: .97")
    parser_query_db.add_argument('-c', '--coverage', required=False, default=0.85, type=float, help="Minimum %% coverage \
                                (decimal between 0 and 1) between query and reference sequences required for \
                                positive identification. Default: .85")
    parser_query_db.add_argument('-j', '--threads', required=False, type=int, default=2,
                                 help="Number of threads to use per query process.")
    parser_query_db.add_argument('-p', '--program', required=False, default="vsearch", help="Indicates which \
                            program to use.  Choices are: 'vsearch'.  Default: 'vsearch'.")
    parser_query_db.set_defaults(command=Query_OTU_DB_Command)

    # =======================
    # ==  Build OTU table  ==
    # =======================
    parser_build_matrix = subparsers.add_parser('build_matrix', description="Given a single barcodes file with all possible \
                            sample groups, a list of the latest groups file(s), and a list of initial samples files \
                            (mapping each original, undereplicated sequence to its sample name), builds an OTU \
                            table.")
    parser_build_matrix.add_argument('-s', '--samples', required=True, help="Input samples file or folder.")
    parser_build_matrix.add_argument('-g', '--groups', required=True, help="Input groups file or folder.")
    parser_build_matrix.add_argument('-b', '--barcodes', required=True, help="Input barcodes file.")
    parser_build_matrix.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_build_matrix.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_build_matrix.set_defaults(command=Build_OTU_Table_Command)

    # =======================
    # ==  Annotate Matrix  ==
    # =======================
    parser_annotate_matrix = subparsers.add_parser('annotate_matrix', description="Given a tabular file mapping \
                            sequence IDs to taxonomic groups, and an OTU matrix, regroups the identifiable \
                            sequence IDs with taxonomic groups.")
    parser_annotate_matrix.add_argument('-i', '--input_f', required=True,
                                        help="Input matrix file or folder of matrix files.")
    parser_annotate_matrix.add_argument('-a', '--annotation', required=True,
                                        help="File mapping sequence IDs to taxonomic groups.")
    parser_annotate_matrix.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_annotate_matrix.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_annotate_matrix.set_defaults(command=Annotate_OTU_Table_Command)

    # ==============================
    # ==  Convert fastq to fasta  ==
    # ==============================
    # translateFastqToFasta(inputFastQ, outputFasta):
    parser_to_fasta = subparsers.add_parser('convert_fastq_to_fasta', description="Converts a fastq file to a fasta \
                            file.")
    parser_to_fasta.add_argument('-i', '--input_f', required=True, help="Fastq file or folder containing fastq files to \
                            translate")
    parser_to_fasta.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_to_fasta.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    parser_to_fasta.set_defaults(command=Convert_Fastq_Fasta_Command)

    # ===========================================
    # == Visualize Samples as OTU Composition  ==
    # ===========================================
    parser_viz_otu_comp = subparsers.add_parser('visualize_otu_sample_composition', description="Creates a stacked \
                            barchart showing the OTU composition in each sample.")
    parser_viz_otu_comp.add_argument('-i', '--input_f', required=True, help="Filepath to the OTU table to visualize.  \
                            Input OTU tables should start with a tab-delimited header row of samplenames, prefixed by \
                            the word \"OTU\".  Each subsequent line should be a tab-delimited line listing an OTU, \
                            followed by its abundance in each Sample.\ne.g.:\n\
                            OTU <Sample_name1> <Sample_name2> <Sample_name3>...\n\
                            otu_name1 0 1 5...\n\
                            otu_name2 1 2 0...\n\
                            otu_name3 3 1 1...\n")
    parser_viz_otu_comp.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_viz_otu_comp.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    group = parser_viz_otu_comp.add_mutually_exclusive_group(required=False)
    group.add_argument('-m', '--pct', help="Real number x in the range (0,1] indicating that the top x%% of OTU names \
                           (sorted by abundance) should be included in the graph.", type=float)
    group.add_argument('-n', '--names', help="Filepath to a list of OTU names to include.  File should be formatted as \
                            a series of lines where each line contains just an OTU name.")
    group.add_argument('-k', '--count', help="Positive integer x indicating that x OTU names (sorted highest \
                            abundance) should be included in the graph.", type=int)
    group.set_defaults(command=Visualize_OTU_Sample_Composition_Command)

    # ===========================
    # == Visualize OTU Heatmap ==
    # ===========================
    parser_viz_otu_heatmap = subparsers.add_parser('visualize_otu_heatmap', description="Creates a heatmap showing the \
                            abundance of each OTU at each sample site.")
    parser_viz_otu_heatmap.add_argument('-i', '--input_f', required=True, help="Filepath to the OTU table to visualize.  \
                            Input OTU tables should start with a tab-delimited header row of samplenames, prefixed by \
                            the word \"OTU\".  Each subsequent line should be a tab-delimited line listing an OTU, \
                            followed by its abundance in each Sample.\ne.g.:\n\
                            OTU <Sample_name1> <Sample_name2> <Sample_name3>...\n\
                            otu_name1 0 1 5...\n\
                            otu_name2 1 2 0...\n\
                            otu_name3 3 1 1...\n")
    parser_viz_otu_heatmap.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_viz_otu_heatmap.add_argument('-p', '--program', required=False, default="chewbacca", help="Indicates which \
                            program to use.  Choices are: 'chewbacca'.  Default: 'chewbacca'.")
    group = parser_viz_otu_heatmap.add_mutually_exclusive_group(required=False)
    group.add_argument('-m', '--pct', help="Real number x in the range (0,1] indicating that the top x%% of OTU names \
                           (sorted by abundance) should be included in the graph.", type=float)
    group.add_argument('-n', '--names', help="Filepath to a list of OTU names to include.  File should be formatted as \
                            a series of lines where each line contains just an OTU name.")
    group.add_argument('-k', '--count', help="Positive integer x indicating that x OTU names (sorted highest \
                            abundance) should be included in the graph.", type=int)
    parser_viz_otu_heatmap.set_defaults(command=Visualize_OTU_Heatmap_Command)

    #=============================================================================================
    """
    # TEST
    from classes.ChewbaccaCommand import ChewbaccaCommand
    from classes.ChewbaccaProgram import ChewbaccaProgram
    from classes.ProgramRunner import ProgramRunner, ProgramRunnerCommands

    class Test_Program_Chewbacca(ChewbaccaProgram):
        name = "test"
        def execute_program(self):
            args = self.args
            p = ProgramRunner(ProgramRunnerCommands.TEST_ECHO, [args.input_f])
            p.run()

    class Test_Command(ChewbaccaCommand):
        default_program = Test_Program_Chewbacca
        supported_programs = [Test_Program_Chewbacca]
        command_name = "Test"

    test_parser = subparsers.add_parser('test', description="test.")
    test_parser.add_argument('-i', '--input_f', required=True, help="File")
    test_parser.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    test_parser.set_defaults(command=Test_Command)
    """
    # =============================================================================================

    # =======================================
    # == Parse args and call default func  ==
    # =======================================
    args, unknown = parser.parse_known_args()
    if unknown:
        print "\nIgnoring unknown args: " + ', '.join(['%s'] * len(unknown)) % tuple(unknown)

    if args.verbose:
        logging.basicConfig(format=FORMAT, level=logging.DEBUG, datefmt=DATEFMT)
    else:
        logging.basicConfig(format=FORMAT, level=logging.ERROR, datefmt=DATEFMT)

    printVerbose.VERBOSE = (args.verbose is not None)
    logging.debug("Initial ARGS are: %s", args)
    print("\t\t")
    signal.signal(signal.SIGTSTP, signal.SIG_IGN)
    makeDirOrdie(args.outdir)
    args.command(args).execute_command()
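main() relies on argparse's set_defaults(command=...) so that whichever sub-command the user picks supplies the class to instantiate and run. A self-contained sketch of that dispatch pattern is shown below; EchoCommand is a hypothetical stand-in, not one of the project's commands.

# Minimal stand-alone demonstration of the set_defaults(command=...) dispatch used above.
import argparse

class EchoCommand(object):
    def __init__(self, args):
        self.args = args

    def execute_command(self):
        print("would run on %s" % self.args.input_f)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="action")
echo_parser = subparsers.add_parser("echo")
echo_parser.add_argument("-i", "--input_f", required=True)
echo_parser.set_defaults(command=EchoCommand)

args = parser.parse_args(["echo", "-i", "reads.fasta"])
args.command(args).execute_command()   # prints: would run on reads.fasta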
Code example #9
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                     blockcount, extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round.  For data set with different average sequence length, this parameter should \
                            be tuned such that it won't take too long for each block to do pairwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' processes to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in above procedure. and r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """

        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                    [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                                        maxmcmc, maxsm, rare, blockcount],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # CLEAN THE OUTPUT GROUPS FILE
        printVerbose("Parsing the groups file from clustering")
        clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(clustered_groups_files, "converted to groups files")
        run_parallel([PythonRunner(parseCROPoutToGroups, [input_,
                                   "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # GATHER AUX FILES
        input_dir = getDirName(input_f)
        aux_files = cleaned_clustered_groups_files
        aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
        aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
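The z and b hints in the docstring above (z*L < 150,000 and b = N/50) can be computed directly from the input fasta. A minimal sketch of such a helper (hypothetical, not part of the chewbacca codebase):

    def suggest_crop_params(fasta_path):
        """Suggest CROP -z (blocksize) and -b (blockcount) values from a fasta file,
        using z*L < 150,000 and b = N/50, where L is the average sequence length
        and N is the number of sequences."""
        total_len = 0
        n_seqs = 0
        with open(fasta_path) as fasta:
            for line in fasta:
                if line.startswith(">"):
                    n_seqs += 1
                else:
                    total_len += len(line.strip())
        avg_len = float(total_len) / max(n_seqs, 1)
        blocksize = max(1, int(150000 / max(avg_len, 1.0)))  # keep z*L under 150,000
        blockcount = max(1, n_seqs // 50)                     # ~50 sequences per first-round block
        return blocksize, blockcount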
Code example #11
def preprocessData(args, pool=Pool(processes=1)):
    # TODO: test run.name is a single word

    makeDirOrdie(args.outDir)

    printVerbose("Preprocessing the data:")

    # *****************************************************************************************
    printVerbose("\t Renaming sequences")
    # "~/programs/fastx/bin/fastx_renamer -n COUNT -i %s %s"
    rename_outFile_f = os.path.join(
        "outDir/",
        os.path.basename(args.input_f) + "_renamed")
    rename_outFile_r = os.path.join(
        "outDir/",
        os.path.basename(args.input_r) + "_renamed")

    pool.map(runInstance, [
        ProgramRunner("fastx_renamer", [args.input_f, rename_outFile_f],
                      {"exists": [args.input_f]}),
        ProgramRunner("fastx_renamer", [args.input_r, rename_outFile_r],
                      {"exists": [args.input_r]}),
    ])
    printVerbose("\tRenamed X sequences")
    # *****************************************************************************************
    # Making the contigs using Pear
    # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s"
    assembledPrefix = os.path.join("outDir", args.name)
    pool.map(runInstance, [
        ProgramRunner("pear", (rename_outFile_f, rename_outFile_r,
                               assembledPrefix, args.threads),
                      {"exists": [rename_outFile_f, rename_outFile_r]})
    ])
    assembledFastqFile = os.path.join("outDir", args.name + ".assembled.fastq")
    # add py char to a web-page
    printVerbose(
        "\t %s sequences assembled, %s contigs discarded, %s sequences discarded"
        % (1, 1, 1))

    # *****************************************************************************************
    # Converting fastq to fasta file (do with mothur or BioSeqIO to keep prog deps to a minimum)
    pool.map(runInstance, [
        ProgramRunner("fastq.info", [assembledFastqFile],
                      {"exists": [assembledFastqFile]})
    ])
    assembledFastaFile = os.path.splitext(assembledFastqFile)[0] + ".fasta"
    # TODO: add py char to a web-page
    printVerbose("\t converted fastq to fasta")
    # *****************************************************************************************
    # Trimming and assigning reads to groups
    # trim.seqs(fasta=%s, oligos=%s, maxambig=0, maxhomop=8, minlength=300, maxlength=550, bdiffs=1, pdiffs=2)

    pool.map(runInstance, [
        ProgramRunner("trim.seqs", [assembledFastaFile, args.barcodes],
                      {"exists": [assembledFastaFile]})
    ])
    printVerbose(
        "\t %s sequences were assigned to groups and %s sequences were discarded"
    )
    trimmedFastaFile = os.path.splitext(assembledFastqFile)[0] + ".trim.fasta"
    # *****************************************************************************************
    # Aligning against the BIOCODETEMPLATE database
    pool.map(runInstance, [
        ProgramRunner("align.seqs", [trimmedFastaFile, args.db],
                      {"exists": [trimmedFastaFile]})
    ])
    printVerbose(
        "\t %s sequences were aligned and %s sequences were discarded"
    )
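A minimal sketch of driving preprocessData directly (hypothetical file names; it only assumes the args attributes the function actually reads):

    from argparse import Namespace
    from multiprocessing import Pool

    args = Namespace(outDir="outDir",
                     input_f="run1_R1.fastq",
                     input_r="run1_R2.fastq",
                     name="run1",
                     threads=2,
                     barcodes="barcodes.oligos",
                     db="BIOCODETEMPLATE.fasta")
    preprocessData(args, pool=Pool(processes=2))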
Code example #12
    def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
        """Dereplicates with vsearch.

        :param input_f: Filepath to the file or folder of files to dereplicate.
        :param outdir: Filepath to the output directory.
        :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                            provided, input sequences are considered singletons (regardless of their name-annotated
                            dereplication count).
        :param processes: The number of processes to use to dereplicate the fileset.
        :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
        # strip counts if we need to.
        if stripcounts:
            printVerbose("Removing counts from sequence names...")
            debugPrintInputInfo(inputs, "renamed")
            run_parallel([PythonRunner(removeCountsFromFastFile,
                                       [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                       {"exists": input_})
                          for input_ in inputs], pool)
            printVerbose("Done removing counts.")

            # Grab the cleaned files as input for the next step
            inputs = getInputFiles(outdir, "*_uncount.fasta")

        # DEREPLICATE
        debugPrintInputInfo(inputs, "dereplicated")
        printVerbose("Dereplicating...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                    [processes, input_,
                                     "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_], "positive": [processes]},
                                    extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done dereplicating")

        # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
        # generates a .groups file named _uc_parsed.out
        # python parseUCtoGroups.py uc.out uc_parsed.out
        input_ucs = getInputFiles(outdir, "*_uc.out")
        printVerbose("Generating a groups file from dereplication.")
        debugPrintInputInfo(inputs, "parsed (into a .groups file)")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in input_ucs], pool)

        most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

        # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
        if groupsfile is not None:
            # Grab the oldgroups file and the dereplicated groups file
            old_groups_files = getInputFiles(groupsfile)
            derep_groups_files = getInputFiles(outdir, "*_derep.groups")

            printVerbose("Updating .groups files with dereplicated data")
            printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
            printVerbose(str(old_groups_files))
            printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
            printVerbose(str(derep_groups_files))

            update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
            most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
            printVerbose("Done updating .groups files.")

        if len(inputs) != len(most_recent_groups_files):
            print ("Error: Number of input fastas (%d) is not equal to the number ofgroups files (%d)." %
                   (len(inputs), len(most_recent_groups_files)))
            exit()
        fasta_groups_pairs = zip(inputs, most_recent_groups_files)
        # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
        # python renameWithReplicantCounts.py
        #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
        printVerbose("Adding dereplication data to unique fasta")
        run_parallel([PythonRunner(renameWithReplicantCounts,
                                   [fasta, groups, "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                                   {"exists": [fasta, groups]})
                      for fasta, groups in fasta_groups_pairs], pool)
        printVerbose("Done adding data")

        aux_dir = makeAuxDir(outdir)
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(most_recent_groups_files, groups_dir)
        aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
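The actual DEREP_VSEARCH command template is defined elsewhere in the project; as a rough, stand-alone sketch of the kind of vsearch call it is expected to wrap (standard vsearch options, hypothetical paths):

    import subprocess

    # Dereplicate one fasta, writing unique sequences and a .uc mapping file.
    subprocess.check_call([
        "vsearch",
        "--derep_fulllength", "outdir/sample_uncount.fasta",
        "--output", "outdir/sample_derep.fasta",
        "--uc", "outdir/sample_uc.out",
        "--threads", "2",
    ])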
Code example #14
    def cluster_swarm(self, input_f, outdir, groupsfile, processes,
                      extraargstring):
        """Clusters sequences using SWARM.
        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM, [
                input_,
                "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}, extraargstring) for input_ in inputs
        ], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([
            PythonRunner(
                parseUCtoGroups,
                [input_, "%s/%s.groups" %
                 (outdir, strip_ixes(input_))], {"exists": [input_]})
            for input_ in clustered_uc_files
        ], pool)
        printVerbose("Done parsing groups files.")

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([
            PythonRunner(removeCountsFromGroupsFile, [
                input_,
                "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))
            ], {"exists": [input_]}) for input_ in clustered_groups_files
        ], pool)
        printVerbose("Done cleaning groups files.")

        printVerbose("Capitalizing sequences")
        # Convert the seeds files to uppercase (swarm writes in lowercase)
        inputs = getInputFiles(outdir, "*_seeds")
        run_parallel([
            PythonRunner(capitalize_seqs, [input_, "%s.fasta" % input_],
                         {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done capitalizing sequences")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(
            outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(
            outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*_seeds.fasta",
                                  ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
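capitalize_seqs is needed because swarm writes its seed sequences in lowercase. A minimal sketch of what such a helper does (hypothetical implementation, assuming plain fasta input; the real one lives elsewhere in chewbacca):

    def capitalize_seqs(input_fasta, output_fasta):
        """Copy a fasta file, uppercasing sequence lines and leaving '>' header lines untouched."""
        with open(input_fasta) as src, open(output_fasta, "w") as dst:
            for line in src:
                if line.startswith(">"):
                    dst.write(line)
                else:
                    dst.write(line.upper())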
Code example #15
    def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
        """Clusters sequences using SWARM.
        :param input_f: A file or folder containing fasta files to cluster.
        :param outdir: The output directory results will be written to.
        :param groupsfile: A groups file or folder containinggroups files that describe the input. Note: if no groups
                            file is supplied, then entries in the fasta file are assumed to be singleton sequences.
        :param idpct: Real number in the range (0,1] that specifies the minimum simmilarity threshold for
                            clustering.  e.g. .95 indicates that a candidate sequence 95% must be at least
                            95% simmilar to the seed sequence to be included in the cluster.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # " --cluster_size %s -id %f --centroids %s  --uc %s",
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                    [input_, float(idpct), "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                     "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # PARSE UC FILE TO GROUPS FILE
        printVerbose("Parsing the clustered uc files to groups files")
        clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
        debugPrintInputInfo(clustered_uc_files, "parsed to groups")
        run_parallel([PythonRunner(parseUCtoGroups, [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_uc_files], pool)

        # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
        printVerbose("Cleaning the .groups file from clustering")
        # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
        clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
        # Remove counts from the clustering groups files
        debugPrintInputInfo(clustered_groups_files, "cleaned")
        run_parallel([PythonRunner(removeCountsFromGroupsFile,
                                   [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done cleaning groups files.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
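A minimal usage sketch (hypothetical object and paths; the actual plumbing that supplies these arguments lives in the command-dispatch code shown earlier):

    # 'clusterer' stands in for whatever Chewbacca program object exposes cluster_vsearch.
    clusterer.cluster_vsearch(input_f="derep_out",              # file or folder of dereplicated fastas
                              outdir="cluster_out",
                              groupsfile="derep_out_groups_files",
                              processes=2,
                              idpct=0.95,                       # candidates must be >= 95% similar to the seed
                              extraargstring="")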