def makeTestSet(file_type):
    makeDirOrdie(test_dir, False)
    test_files = {}
    for entry in test_data.keys():
        sequence_names = test_data[entry].split(",")
        test_files[entry] = makeTestFile("testSet/%s.%s" % (entry, file_type), file_type, sequence_names)
    return test_files
def test_assemble():
    """Test the Assemble_Command to ensure that:

    1. the command creates just one output file in outdir.
    2. the output file is correct (1 assembled read).
    3. the command creates an aux dir.

    Relevant command-line arguments:
    parser_assemble.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_assemble.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_assemble.add_argument('-n', '--name', required=True, help="Assembled File Prefix.")
    parser_assemble.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    """
    # Build a minimal stand-in for an argparse.Namespace; attributes are attached below.
    params = lambda: 0
    test_reads = "assemble_data/test_%s.fq"
    params.input_f = test_reads % "R1"
    params.input_r = test_reads % "R2"
    params.outdir = "rslt"
    params.name = "test"
    params.processes = 1
    params.pearthreads = 1
    params.extraargstring = ""
    for program in Assemble_Command.supported_programs:
        cleanup_files(params.outdir)
        makeDirOrdie(params.outdir)
        params.program = program.name
        Assemble_Command(params).execute_command()
        assert_outdir(params.outdir)
        output_files = getInputFiles(params.outdir, "*assembled*")
        assert_equals(len(output_files), 1)
        assert_auxdir(params.outdir)
        seqs = fasta_to_list(output_files[0], 'fastq')
        assert_equals(len(seqs), 1)
        for seq in seqs:
            assert_true("good" in seq.id)
    return True
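# The test above reads the assembled output back with fasta_to_list. A minimal
# sketch of such a helper, assuming BioPython's SeqIO is available; the
# project's real helper may differ in details (e.g. error handling).
from Bio import SeqIO


def fasta_to_list(file_path, file_type):
    """Read every record in a fasta/fastq file into a list of SeqRecords."""
    return list(SeqIO.parse(file_path, file_type))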
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes): """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes. :param input_f: Filepath to an input file or folder to rename. :param outdir: Filepath to the output directory. :param filetype: Either 'fasta' or 'fastq'. :param clip: If True, remove dereplication counts from sequence names before renaming. :param processes: The maximum number of processes to use. """ # Gather input files inputs = getInputFiles(input_f) debugPrintInputInfo(inputs, "rename") pool = init_pool(min(len(inputs), processes)) printVerbose("Renaming sequences...") # Run serialRename in run_parallel run_parallel([ PythonRunner(serialRename, [ input_, "%s/%s_renamed%s" % (outdir, strip_ixes(input_), os.path.splitext(input_)[1]), filetype, clip ], {"exists": [input_]}) for input_ in inputs ], pool) printVerbose("Done renaming sequences...") samples_dir = makeDirOrdie("%s_samples" % outdir) samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False) bulk_move_to_dir(samples_files, samples_dir) aux_dir = makeAuxDir(outdir) aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False) bulk_move_to_dir(aux_files, aux_dir) cleanup_pool(pool)
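# rename_chewbacca above delegates the per-file renaming to serialRename. A
# minimal sketch of that worker, assuming BioPython's SeqIO; the clip handling
# and the .samples/.mapping bookkeeping done by the real implementation are
# omitted, so treat this as illustrative only.
import os
from Bio import SeqIO


def serialRename(input_file, output_file, filetype, clip):
    """Rename records as <filename>_ID0, <filename>_ID1, ... and write them out."""
    base = os.path.splitext(os.path.basename(input_file))[0]

    def renamed_records():
        for i, record in enumerate(SeqIO.parse(input_file, filetype)):
            record.id = "%s_ID%d" % (base, i)
            record.description = ""
            yield record

    SeqIO.write(renamed_records(), output_file, filetype)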
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their
                        name-annotated dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVE COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # Strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating.")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # Generates a .groups file named *_uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    debugPrintInputInfo(inputs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)

    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))

        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    if len(inputs) != len(most_recent_groups_files):
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()

    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNTS TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py 8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups, "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)

    cleanup_pool(pool)
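# dereplicate_vsearch above converts vsearch's .uc table to a .groups file with
# parseUCtoGroups. A minimal sketch of that conversion, assuming a .groups line
# has the form '<seed>\t<member> <member> ...' with the seed listed among its
# own members; the project's actual format and helper may differ.
def parseUCtoGroups(uc_file, groups_file):
    """Collect seed ('S') and hit ('H') records from a .uc file into groups."""
    groups = {}
    with open(uc_file) as uc:
        for line in uc:
            fields = line.rstrip("\n").split("\t")
            if not fields or fields[0] not in ("S", "H"):
                continue
            if fields[0] == "S":
                # Seed record: column 9 is the centroid's own label.
                groups.setdefault(fields[8], [fields[8]])
            else:
                # Hit record: column 9 is the query, column 10 its centroid.
                groups.setdefault(fields[9], [fields[9]]).append(fields[8])
    with open(groups_file, "w") as out:
        for seed, members in groups.items():
            out.write("%s\t%s\n" % (seed, " ".join(members)))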
def main(argv):
    """Parses command line args, builds an argparse.ArgumentParser, and runs the chosen command.
    Otherwise, prints usage.

    :param argv: Command line arguments as a list of strings
    """
    parser = argparse.ArgumentParser(description="arms description", epilog="arms long description")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version)
    parser.add_argument("--verbose", dest='verbose', help="Increase output verbosity")
    parser.add_argument('-t', '--processes', type=int, default=1, help="The maximum number of processes to spawn.")
    parser.add_argument('--dryrun', dest='dryrrun', default=False, action='store_true',
                        help="Return command line commands without validation or execution.")
    parser.add_argument('--debugtest', default=False)
    parser.add_argument('-y', '--extraargstring', default="", required=False,
                        help="Auxiliary parameters you wish to pass to the called program (such as options "
                             "chewbacca doesn't support). USE AT YOUR OWN RISK.")

    subparsers = parser.add_subparsers(dest='action', help='Available commands')

    # ====================================
    # ==  Fix reads with BayesHammer    ==
    # ====================================
    # "SPADES_PRECLEAN": --only-error-correction -o %s -1 %s -2 %s"
    parser_preclean = subparsers.add_parser('preclean',
                                            description="Given a pair of left and right fasta/fastq reads, or a "
                                                        "pair of folders containing the left and right fasta/fastq "
                                                        "files, fixes short errors in the reads using BayesHammer. "
                                                        "Forward reads filegroups should end in '_forward.<ext>' or "
                                                        "'_R1.<ext>'. Reverse reads filegroups should end in "
                                                        "'_reverse.<ext>' or '_R2.<ext>' (where <ext> is the file "
                                                        "extension).")
    parser_preclean.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_preclean.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_preclean.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_preclean.add_argument('-p', '--program', required=False, default="bayeshammer",
                                 help="Indicates which program to use. Choices are: 'bayeshammer'. "
                                      "Default: 'bayeshammer'.")
    parser_preclean.add_argument('-j', '--bayesthreads', type=int, required=False, default=1,
                                 help="The number of threads to use per process (default is 1).")
    parser_preclean.set_defaults(command=Preclean_Command)

    # =================================
    # ==  Assemble Reads using pear  ==
    # =================================
    # "pear": programPaths["PEAR"] + " -f \"%s\" -r \"%s\" -o \"%s\" -j %d "
    parser_assemble = subparsers.add_parser('assemble',
                                            description="Given a pair of left and right fasta/fastq reads, or a "
                                                        "pair of folders containing the left and right fasta/fastq "
                                                        "files, assembles the left and right read files into "
                                                        "contiguous sequence files. Forward reads filegroups should "
                                                        "end in '_forward.<ext>' or '_R1.<ext>'. Reverse reads "
                                                        "filegroups should end in '_reverse.<ext>' or '_R2.<ext>' "
                                                        "(where <ext> is the file extension).")
    parser_assemble.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads file or folder.")
    parser_assemble.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads file or folder.")
    parser_assemble.add_argument('-n', '--name', required=True, help="Assembled File Prefix.")
    parser_assemble.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_assemble.add_argument('-p', '--program', required=False, default="pear",
                                 help="Indicates which program to use. Choices are: 'pear'. Default: 'pear'.")
    parser_assemble.add_argument('-j', '--pearthreads', type=int, required=False, default=1,
                                 help="Pear: The number of threads to use per process (default is 1).")
    parser_assemble.set_defaults(command=Assemble_Command)

    # ====================================
    # ==  Split by barcode with FastX   ==
    # ====================================
    # "barcode.splitter": "cat \"%s\" | " + programPaths["FASTX"] + "fastx_barcode_splitter.pl --bcfile \"%s\" \
    #                      -prefix \"%s\" --suffix .fastq --bol --mismatches 1",
    parser_demux_barcode = subparsers.add_parser('demux_barcode',
                                                 description="Given a single barcodes file, and a fasta/fastq file "
                                                             "or a folder containing fasta/fastq files, splits each "
                                                             "input fasta/fastq file into separate sample files "
                                                             "(based on each sequence's barcode), for each barcode "
                                                             "in the barcodes file. Sample files of size zero are "
                                                             "ignored.")
    parser_demux_barcode.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_demux_barcode.add_argument('-b', '--barcodes', required=True,
                                      help="Tab delimited file of barcodes and corresponding sample names.")
    parser_demux_barcode.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_demux_barcode.add_argument('-p', '--program', required=False, default="fastx",
                                      help="Indicates which program to use. Choices are: 'fastx'. Default: 'fastx'.")
    parser_demux_barcode.set_defaults(command=Demux_Barcode_Command)

    # =====================================
    # ==  Split by name with Chewbacca   ==
    # =====================================
    # "barcode.splitter": "cat \"%s\" | " + programPaths["FASTX"] + "fastx_barcode_splitter.pl --bcfile \"%s\" \
    #                      -prefix \"%s\" --suffix .fastq --bol --mismatches 1",
    parser_demux_name = subparsers.add_parser('demux_name',
                                              description="Given a barcodes file listing sequence names and 'fake' "
                                                          "barcode sequences, and a fasta/fastq file or a folder "
                                                          "containing fasta/fastq files, splits each input "
                                                          "fasta/fastq file into separate sample files (based on a "
                                                          "sample-keyword in each sequence's name), for each sample "
                                                          "name in the barcodes file. Sample files of size zero are "
                                                          "ignored.")
    parser_demux_name.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_demux_name.add_argument('-b', '--barcodes', required=True,
                                   help="Tab delimited file of barcodes and corresponding sample names. Barcode "
                                        "sequences are ignored and can be faked. Only sample names will be read.")
    parser_demux_name.add_argument('-f', '--filetype', required=True,
                                   help="Indicates input and output filetype. Either 'fasta' or 'fastq'.")
    parser_demux_name.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_demux_name.add_argument('-p', '--program', required=False, default="chewbacca",
                                   help="Indicates which program to use. Choices are: 'chewbacca'. "
                                        "Default: 'chewbacca'.")
    parser_demux_name.set_defaults(command=Demux_Name_Command)

    # ==================================================
    # ==  Rename reads serially with renameSequences  ==
    # ==================================================
    # renameSequences(input, output)
    parser_rename = subparsers.add_parser('rename',
                                          description="Given a fasta/fastq file or directory of fasta/fastq files, "
                                                      "serially renames each sequence in each file, with the "
                                                      "filename as a prefix. e.g. the sequences in abc.fasta are "
                                                      "renamed abc_ID0, abc_ID1, abc_ID2, etc.")
    parser_rename.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_rename.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_rename.add_argument('-f', '--filetype', required=True,
                               help="The filetype of the input files. Either 'fasta' or 'fastq'.")
    parser_rename.add_argument('-c', '--clip', required=False, default=True,
                               help="Set True if input file groups contain trailing identifiers from the demux_seqs "
                                    "step. e.g. True if file names contain '_0', '_1', '_2', etc. Default: True.")
    parser_rename.add_argument('-p', '--program', required=False, default="chewbacca",
                               help="Indicates which program to use. Choices are: 'chewbacca'. Default: 'chewbacca'.")
    parser_rename.set_defaults(command=Rename_Command)

    # =================================================
    # ==  Trim barcodes and adapters using flexbar   ==
    # =================================================
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\""
    parser_trim = subparsers.add_parser('trim_adapters',
                                        description="Given a single adapters file, a single adaptersrc file, and a "
                                                    "fasta/fastq file or a folder containing fasta/fastq files, "
                                                    "removes the specified adapters (and preceding barcodes) from "
                                                    "all sequences in the given fasta/fastq files.")
    parser_trim.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_trim.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_trim.add_argument('-a', '--adapters', required=True, help="Forward Adapters file.")
    parser_trim.add_argument('-arc', '--adaptersrc', required=True, help="Reverse Complemented Adapters file.")
    parser_trim.add_argument('-p', '--program', required=False, default="flexbar",
                             help="Indicates which program to use. Choices are: 'flexbar'. Default: 'flexbar'.")
    parser_trim.add_argument('-u', '--allowedns', required=False, default=0, type=int,
                             help="Flexbar: The number of unknown 'N' bases a sequence is allowed before being "
                                  "thrown out. Default: 0.")
    parser_trim.set_defaults(command=Clean_Adapters_Command)

    # ====================================
    # ==  Clean Reads with Low Quality  ==
    # ====================================
    # Clean low-quality reads with trimmomatic
    # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #                -phred33 input output_cleaned.fastq SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    parser_trimmomatic = subparsers.add_parser('clean_seqs',
                                               description="Given a single fastq file or a folder containing fastq "
                                                           "files, trims away areas of low read quality (specified "
                                                           "by -q) in each sequence. Then, the longest remaining "
                                                           "segment (of at least length -m) is recorded as the new "
                                                           "sequence.")
    parser_trimmomatic.add_argument('-i', '--input_f', required=True, help="Input fastq file or folder")
    parser_trimmomatic.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_trimmomatic.add_argument('-p', '--program', required=False, default="trimmomatic",
                                    help="Indicates which program to use. Choices are: 'trimmomatic'. "
                                         "Default: 'trimmomatic'.")
    parser_trimmomatic.add_argument('-m', '--minlen', type=int, default=200,
                                    help="Trimmomatic: Minimum length for cleaned sequences")
    parser_trimmomatic.add_argument('-w', '--windowSize', type=int, default=5,
                                    help="Trimmomatic: Size of the sliding window")
    parser_trimmomatic.add_argument('-q', '--quality', type=int, default=25,
                                    help="Trimmomatic: Minimum average quality for items in the sliding window")
    parser_trimmomatic.set_defaults(command=Clean_Quality_Command)

    # ==========================================
    # ==  Dereplicate sequences with vsearch  ==
    # ==========================================
    # " vsearch --threads %d --derep_fulllength %s --sizeout --fasta_width 0 --output %s -uc %s",
    parser_derep = subparsers.add_parser('dereplicate_fasta',
                                         description="Given a fasta file or folder, removes identical duplicates "
                                                     "and subsequences WITHIN EACH FILE, keeping the longest "
                                                     "sequence, and renames it with the number of duplicates as "
                                                     "'<longest_sequence_name>_<duplicate count>'.")
    parser_derep.add_argument('-i', '--input_f', required=True, help="Input fasta file or folder of fasta files.")
    parser_derep.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_derep.add_argument('-j', '--threads', required=False, type=int, default=2,
                              help="Number of threads to use per query process.")
    parser_derep.add_argument('-g', '--groupsfile', required=False,
                              help="A .groups file to update. If no .groups file is provided, then sequences are "
                                   "assumed to be singletons.")
    parser_derep.add_argument('-p', '--program', required=False, default="vsearch",
                              help="Indicates which program to use. Choices are: 'vsearch'. Default: 'vsearch'.")
    parser_derep.add_argument('-s', '--stripcounts', required=False, type=bool, default=False,
                              help="If included, strip counts from sequence groups before clustering. This allows "
                                   "for the recognition of sequence groups that are annotated with dereplication "
                                   "counts.")
    parser_derep.set_defaults(command=Dereplicate_Command)

    # ============================================
    # ==  Partition fastas with splitKperFasta  ==
    # ============================================
    # splitK(inputFasta, prefix, nbSeqsPerFile, filetype):
    parser_split = subparsers.add_parser('partition',
                                         description="Given a fasta/fastq file, or folder containing fasta/fastq "
                                                     "files, splits each file into a set of sub files labeled "
                                                     "<filename>_part_x.<ext>, where each subfile contains at most "
                                                     "<--chunksize> sequences (where <ext> is the file extension of "
                                                     "the original file).")
    parser_split.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_split.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_split.add_argument('-c', '--chunksize', type=int, required=True,
                              help="Chunksize. (Max number of sequences per file).")
    parser_split.add_argument('-f', '--filetype', required=True,
                              help="The filetype of the input files. Either 'fasta' or 'fastq'.")
    parser_split.add_argument('-p', '--program', required=False, default="chewbacca",
                              help="Indicates which program to use. Choices are: 'chewbacca'. Default: 'chewbacca'.")
    parser_split.set_defaults(command=Partition_Command)

    # =======================
    # ==  Merge the files  ==
    # =======================
    # merge_files(input_file_list, output_file)
    parser_cat = subparsers.add_parser('merge_files',
                                       description="Given a folder containing any type of file, concatenates all "
                                                   "files in that folder together, regardless of their filetype. "
                                                   "Note: Any file headers/footers will be written as well.")
    parser_cat.add_argument('-i', '--input_f', required=True, help="Input directory containing files to concatenate.")
    parser_cat.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_cat.add_argument('-n', '--name', required=True, help="Prefix name for the merged file.")
    parser_cat.add_argument('-f', '--fileext', required=True, help="File extension for the output file.")
    parser_cat.add_argument('-p', '--program', required=False, default="chewbacca",
                            help="Indicates which program to use. Choices are: 'chewbacca'. Default: 'chewbacca'.")
    parser_cat.set_defaults(command=Merge_Command)

    # =======================
    # ==  Ungap the files  ==
    # =======================
    # ungap(file_to_clean, output_file_name, gap_char, file_type)
    parser_ungap = subparsers.add_parser('ungap_fasta',
                                         description="Given a fasta/fastq file or a folder containing fasta/fastq "
                                                     "files, removes alignment characters (specified by -g).")
    parser_ungap.add_argument('-i', '--input_f', required=True, help="Input fasta/fastq file or folder.")
    parser_ungap.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_ungap.add_argument('-f', '--fileext', required=True,
                              help="File extension for the output file. Either 'fasta' or 'fastq'.")
    parser_ungap.add_argument('-g', '--gapchars', required=True,
                              help="A string of one or more characters to remove from the sequences (but not "
                                   "sequence groups) in the input files.")
    parser_ungap.add_argument('-p', '--program', required=False, default="chewbacca",
                              help="Indicates which program to use. Choices are: 'chewbacca'. Default: 'chewbacca'.")
    parser_ungap.set_defaults(command=Ungap_Command)

    # ==============================
    # ==  Align Reads with MACSE  ==
    # ==============================
    # "macse_align": "java -jar " + programPaths["MACSE"] + " -prog enrichAlignment -seq \"%s\" -align \
    #                 \"%s\" -seq_lr \"%s\" -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 \
    #                 -maxDEL_inSeq 3 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT \"%s\"_NT \
    #                 -out_AA \"%s\"_AA -seqToAdd_logFile \"%s\"_log.csv",
    parser_align = subparsers.add_parser('macseAlign',
                                         description="Aligns sequences using the 'enrichAlignment' program in "
                                                     "Macse. Removes chimeras, invalid sequences, and orients "
                                                     "sequences in the forward read direction.")
    parser_align.add_argument('-i', '--input_f', required=True, help="Input fasta")
    parser_align.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_align.add_argument('-d', '--db', required=True, help="Database against which to align and filter reads")
    parser_align.add_argument('-p', '--program', required=False, default="macse",
                              help="Indicates which program to use. Choices are: 'macse'. Default: 'macse'.")
    parser_align.set_defaults(command=Clean_Deep_Command)

    # =====================================
    # ==  Clean Aligned Reads with MACSE ==
    # =====================================
    # "macse_format": "java -jar " + programPaths["MACSE"] + " -prog exportAlignment -align \"%s\" \
    #                  -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\"",
    parser_macse_clean = subparsers.add_parser('macseClean',
                                               description="Removes gap characters inserted by the macse aligner.")
    parser_macse_clean.add_argument('-i', '--input_f', required=True, help="Input fasta")
    parser_macse_clean.add_argument('-s', '--samplesdir', required=True, help="Samples dir")
    parser_macse_clean.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_macse_clean.add_argument('-d', '--db', required=True,
                                    help="Database against which to align and filter reads")
    parser_macse_clean.add_argument('-p', '--program', required=False, default="macse",
                                    help="Indicates which program to use. Choices are: 'macse'. Default: 'macse'.")
    parser_macse_clean.set_defaults(command=Clean_Deep_Repair_Command)

    # ============================================
    # ==  Cluster using CROP, SWARM, OR VSEARCH ==
    # ============================================
    parser_cluster = subparsers.add_parser('cluster_seqs',
                                           description="Given a fasta file or a folder containing fasta files, "
                                                       "performs clustering on each input file individually. "
                                                       "Outputs a .groups file listing unique representative "
                                                       "sequences from each cluster, and the groups of the "
                                                       "sequences they represent. Also outputs a "
                                                       "<input_file_name>_seeds.fasta file of the unique "
                                                       "representatives.")
    parser_cluster.add_argument('-i', '--input_f', required=True, help="Input fasta file/folder")
    parser_cluster.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_cluster.add_argument('-p', '--program', required=False, default="swarm",
                                help="One of 'crop', 'swarm', or 'vsearch' to indicate which clustering program to "
                                     "use. Default: 'swarm'.")
    parser_cluster.add_argument('-g', '--groupsfile', required=False, help="A .groups file to update.")
    parser_cluster.add_argument('-s', '--stripcounts', required=False, type=bool, default=True,
                                help="If True, strip counts from sequence groups before clustering. This allows for "
                                     "the recognition of group names in cases where dereplication counts have been "
                                     "added to group names.")
    # CROP options
    parser_cluster.add_argument('-z', '--blocksize', required=False, type=int, default=500,
                                help="CROP only: Size of blocks to be used for all rounds (if -b is specified, then "
                                     "-z will not affect the first round). For data sets with different average "
                                     "sequence lengths, this parameter should be tuned such that it won't take too "
                                     "long for each block to do pairwise alignment. Hint for choosing z: "
                                     "z*L < 150,000, where L is the average length of the sequences. Default: 500.")
    parser_cluster.add_argument('-b', '--blockcount', required=False, type=int,
                                help="CROP only: The size of blocks in the first round of clustering. Hint for "
                                     "choosing -b: each block in the first round should contain about 50 sequences, "
                                     "i.e. b=N/50, where N is the number of input sequences. "
                                     "Default: # input sequences / z.")
    parser_cluster.add_argument('-e', '--maxmcmc', required=False, type=int, default=2000,
                                help="CROP only: This parameter specifies the number of iterations of MCMC. Default "
                                     "value is 2000. Increase this value to enhance accuracy (recommended value is "
                                     "at least 10*block size).")
    parser_cluster.add_argument('-c', '--clustpct', required=False, default="g",
                                help="CROP only: The minimum similarity threshold for clustering. Either 'g' for "
                                     "95%% or 's' for 97%%. Default: 'g'.")
    parser_cluster.add_argument('-m', '--maxsm', required=False, type=int, default=20,
                                help="CROP only: This parameter specifies the maximum number of 'split and merge' "
                                     "processes to run. Default value is 20, which is also the maximum allowed.")
    parser_cluster.add_argument('-r', '--rare', required=False, type=int, default=2,
                                help="CROP only: The maximum cluster size allowed to be classified as 'rare'. "
                                     "Clusters are defined as either 'abundant' or 'rare'. 'Abundant' clusters will "
                                     "be clustered first, then the 'rare' clusters are mapped to the 'abundant' "
                                     "clusters. Finally, 'rare' clusters which cannot be mapped will be clustered "
                                     "separately. e.g. If r=5, clusters with size <=5 will be considered 'rare' in "
                                     "the above procedure, and r=0 will yield the best accuracy. If you believe "
                                     "your data is not too diverse to be handled, then r=0 will be the best choice. "
                                     "Default: 2.")
    # Vsearch options
    parser_cluster.add_argument('-v', '--idpct', required=False, type=float, default=.95,
                                help="VSEARCH only: %% match required for clustering. Real number in the range "
                                     "(0,1]. Default: 0.95")
    parser_cluster.set_defaults(command=Cluster_Command)

    # ================================================
    # ==  Closed Ref Picking with fasta references  ==
    # ================================================
    # --usearch_global ../9_p_uchime/seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #       --userfields query+target+id+alnlen+qcov --userout out --alnout alnout.txt
    parser_query_fasta = subparsers.add_parser('query_fasta',
                                               description="Given a fasta file, or folder containing fasta files, "
                                                           "aligns and identifies OTUs against a curated fasta "
                                                           "file.")
    parser_query_fasta.add_argument('-i', '--input_f', required=True, help="Input file/folder with fasta files")
    parser_query_fasta.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_query_fasta.add_argument('-r', '--referencefasta', required=True,
                                    help="Filepath to the curated fasta file to use as a reference.")
    parser_query_fasta.add_argument('-x', '--taxinfo', required=True,
                                    help="Filepath to a two-column, tab-delimited file mapping a sequence's fasta "
                                         "id (in the referencefasta file) to a taxonomic identification.")
    parser_query_fasta.add_argument('-s', '--simmilarity', required=False, default=0.97, type=float,
                                    help="Minimum %% similarity (decimal between 0 and 1) between query and "
                                         "reference sequences required for positive identification. Default: .97")
    parser_query_fasta.add_argument('-p', '--program', required=False, default="vsearch",
                                    help="Indicates which program to use. Choices are: 'vsearch'. "
                                         "Default: 'vsearch'.")
    parser_query_fasta.add_argument('-c', '--coverage', required=False, default=0.85, type=float,
                                    help="Minimum %% coverage (decimal between 0 and 1) between query and reference "
                                         "sequences required for positive identification. Default: .85")
    parser_query_fasta.add_argument('-j', '--threads', required=False, type=int, default=2,
                                    help="Number of threads to use per query process.")
    parser_query_fasta.set_defaults(command=Query_OTU_Fasta_Command)

    # ====================================
    # ==  Closed Ref Picking with NCBI  ==
    # ====================================
    # --usearch_global ../9_p_uchime/seeds.pick.fasta --db /home/mahdi/refs/COI_DOWNLOADED/COI.fasta -id 0.9 \
    #       --userfields query+target+id+alnlen+qcov --userout out --alnout alnout.txt
    # --userfields query+target+id+alnlen+qcov
    parser_query_db = subparsers.add_parser('query_db',
                                            description="Given a fasta file, or folder containing fasta files, "
                                                        "aligns and identifies OTUs against a curated database.")
    parser_query_db.add_argument('-i', '--input_f', required=True, help="Input Fasta File to clean")
    parser_query_db.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved")
    parser_query_db.add_argument('-r', '--referencefasta', required=True,
                                 help="Filepath to the curated fasta file to use as a reference.")
    parser_query_db.add_argument('-d', '--db', required=True,
                                 help="Filepath to the curated fasta file to use as a reference.")
    parser_query_db.add_argument('-s', '--simmilarity', required=False, default=0.97, type=float,
                                 help="Minimum %% similarity (decimal between 0 and 1) between query and reference "
                                      "sequences required for positive identification. Default: .97")
    parser_query_db.add_argument('-c', '--coverage', required=False, default=0.85, type=float,
                                 help="Minimum %% coverage (decimal between 0 and 1) between query and reference "
                                      "sequences required for positive identification. Default: .85")
    parser_query_db.add_argument('-j', '--threads', required=False, type=int, default=2,
                                 help="Number of threads to use per query process.")
    parser_query_db.add_argument('-p', '--program', required=False, default="vsearch",
                                 help="Indicates which program to use. Choices are: 'vsearch'. Default: 'vsearch'.")
    parser_query_db.set_defaults(command=Query_OTU_DB_Command)

    # =======================
    # ==  Build OTU table  ==
    # =======================
    parser_build_matrix = subparsers.add_parser('build_matrix',
                                                description="Given a single barcodes file with all possible sample "
                                                            "groups, a list of the latest groups file(s), and a "
                                                            "list of initial samples files (mapping each original, "
                                                            "undereplicated sequence to its sample name), builds an "
                                                            "OTU table.")
    parser_build_matrix.add_argument('-s', '--samples', required=True, help="Input samples file or folder.")
    parser_build_matrix.add_argument('-g', '--groups', required=True, help="Input groups file or folder.")
    parser_build_matrix.add_argument('-b', '--barcodes', required=True, help="Input barcodes file.")
    parser_build_matrix.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_build_matrix.add_argument('-p', '--program', required=False, default="chewbacca",
                                     help="Indicates which program to use. Choices are: 'chewbacca'. "
                                          "Default: 'chewbacca'.")
    parser_build_matrix.set_defaults(command=Build_OTU_Table_Command)

    # =======================
    # ==  Annotate Matrix  ==
    # =======================
    parser_annotate_matrix = subparsers.add_parser('annotate_matrix',
                                                   description="Given a tabular file mapping sequence IDs to "
                                                               "taxonomic groups, and an OTU matrix, regroups the "
                                                               "identifiable sequence IDs with taxonomic groups.")
    parser_annotate_matrix.add_argument('-i', '--input_f', required=True,
                                        help="Input matrix file or folder of matrix files.")
    parser_annotate_matrix.add_argument('-a', '--annotation', required=True,
                                        help="File mapping sequence IDs to taxonomic groups.")
    parser_annotate_matrix.add_argument('-o', '--outdir', required=True,
                                        help="Directory where outputs will be saved.")
    parser_annotate_matrix.add_argument('-p', '--program', required=False, default="chewbacca",
                                        help="Indicates which program to use. Choices are: 'chewbacca'. "
                                             "Default: 'chewbacca'.")
    parser_annotate_matrix.set_defaults(command=Annotate_OTU_Table_Command)

    # ==============================
    # ==  Convert fastq to fasta  ==
    # ==============================
    # translateFastqToFasta(inputFastQ, outputFasta):
    parser_to_fasta = subparsers.add_parser('convert_fastq_to_fasta',
                                            description="Converts a fastq file to a fasta file.")
    parser_to_fasta.add_argument('-i', '--input_f', required=True,
                                 help="Fastq file or folder containing fastq files to translate")
    parser_to_fasta.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_to_fasta.add_argument('-p', '--program', required=False, default="chewbacca",
                                 help="Indicates which program to use. Choices are: 'chewbacca'. "
                                      "Default: 'chewbacca'.")
    parser_to_fasta.set_defaults(command=Convert_Fastq_Fasta_Command)

    # ===========================================
    # ==  Visualize Samples as OTU Composition ==
    # ===========================================
    parser_viz_otu_comp = subparsers.add_parser('visualize_otu_sample_composition',
                                                description="Creates a stacked barchart showing the OTU composition "
                                                            "in each sample.")
    parser_viz_otu_comp.add_argument('-i', '--input_f', required=True,
                                     help="Filepath to the OTU table to visualize. Input OTU tables should start "
                                          "with a tab-delimited header row of sample names, prefixed by the word "
                                          "\"OTU\". Each subsequent line should be a tab-delimited line listing an "
                                          "OTU, followed by its abundance in each Sample.\ne.g.:\n"
                                          "OTU <Sample_name1> <Sample_name2> <Sample_name3>...\n"
                                          "otu_name1 0 1 5...\n"
                                          "otu_name2 1 2 0...\n"
                                          "otu_name3 3 1 1...\n")
    parser_viz_otu_comp.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    parser_viz_otu_comp.add_argument('-p', '--program', required=False, default="chewbacca",
                                     help="Indicates which program to use. Choices are: 'chewbacca'. "
                                          "Default: 'chewbacca'.")
    group = parser_viz_otu_comp.add_mutually_exclusive_group(required=False)
    group.add_argument('-m', '--pct', type=float,
                       help="Real number x in the range (0,1] indicating that the top x%% of OTU names (sorted by "
                            "abundance) should be included in the graph.")
    group.add_argument('-n', '--names',
                       help="Filepath to a list of OTU names to include. File should be formatted as a series of "
                            "lines where each line contains just an OTU name.")
    group.add_argument('-k', '--count', type=int,
                       help="Positive integer x indicating that x OTU names (sorted highest abundance) should be "
                            "included in the graph.")
    parser_viz_otu_comp.set_defaults(command=Visualize_OTU_Sample_Composition_Command)

    # ===========================
    # == Visualize OTU Heatmap ==
    # ===========================
    parser_viz_otu_heatmap = subparsers.add_parser('visualize_otu_heatmap',
                                                   description="Creates a heatmap showing the abundance of each OTU "
                                                               "at each sample site.")
    parser_viz_otu_heatmap.add_argument('-i', '--input_f', required=True,
                                        help="Filepath to the OTU table to visualize. Input OTU tables should start "
                                             "with a tab-delimited header row of sample names, prefixed by the word "
                                             "\"OTU\". Each subsequent line should be a tab-delimited line listing "
                                             "an OTU, followed by its abundance in each Sample.\ne.g.:\n"
                                             "OTU <Sample_name1> <Sample_name2> <Sample_name3>...\n"
                                             "otu_name1 0 1 5...\n"
                                             "otu_name2 1 2 0...\n"
                                             "otu_name3 3 1 1...\n")
    parser_viz_otu_heatmap.add_argument('-o', '--outdir', required=True,
                                        help="Directory where outputs will be saved.")
    parser_viz_otu_heatmap.add_argument('-p', '--program', required=False, default="chewbacca",
                                        help="Indicates which program to use. Choices are: 'chewbacca'. "
                                             "Default: 'chewbacca'.")
    group = parser_viz_otu_heatmap.add_mutually_exclusive_group(required=False)
    group.add_argument('-m', '--pct', type=float,
                       help="Real number x in the range (0,1] indicating that the top x%% of OTU names (sorted by "
                            "abundance) should be included in the graph.")
    group.add_argument('-n', '--names',
                       help="Filepath to a list of OTU names to include. File should be formatted as a series of "
                            "lines where each line contains just an OTU name.")
    group.add_argument('-k', '--count', type=int,
                       help="Positive integer x indicating that x OTU names (sorted highest abundance) should be "
                            "included in the graph.")
    parser_viz_otu_heatmap.set_defaults(command=Visualize_OTU_Heatmap_Command)

    # =============================================================================================
    """
    # TEST
    from classes.ChewbaccaCommand import ChewbaccaCommand
    from classes.ChewbaccaProgram import ChewbaccaProgram
    from classes.ProgramRunner import ProgramRunner, ProgramRunnerCommands

    class Test_Program_Chewbacca(ChewbaccaProgram):
        name = "test"

        def execute_program(self):
            args = self.args
            p = ProgramRunner(ProgramRunnerCommands.TEST_ECHO, [args.input_f])
            p.run()

    class Test_Command(ChewbaccaCommand):
        default_program = Test_Program_Chewbacca
        supported_programs = [Test_Program_Chewbacca]
        command_name = "Test"

    test_parser = subparsers.add_parser('test', description="test.")
    test_parser.add_argument('-i', '--input_f', required=True, help="File")
    test_parser.add_argument('-o', '--outdir', required=True, help="Directory where outputs will be saved.")
    test_parser.set_defaults(command=Test_Command)
    """
    # =============================================================================================

    # =======================================
    # ==  Parse args and call default func ==
    # =======================================
    args, unknown = parser.parse_known_args()
    if unknown:
        print("\nIgnoring unknown args: " + ', '.join(['%s'] * len(unknown)) % tuple(unknown))
    if args.verbose:
        logging.basicConfig(format=FORMAT, level=logging.DEBUG, datefmt=DATEFMT)
    else:
        logging.basicConfig(format=FORMAT, level=logging.ERROR, datefmt=DATEFMT)
    printVerbose.VERBOSE = (args.verbose is not None)
    logging.debug("Initial ARGS are: %s", args)
    print("\t\t")
    signal.signal(signal.SIGTSTP, signal.SIG_IGN)
    makeDirOrdie(args.outdir)
    args.command(args).execute_command()
def preprocessData(args, pool=None):
    # TODO: test run.name is a single word
    # Build the worker pool lazily rather than as a mutable default argument,
    # so a multiprocessing.Pool is not spawned at import time.
    if pool is None:
        pool = Pool(processes=1)
    makeDirOrdie(args.outDir)
    printVerbose("Preprocessing the data:")

    # *****************************************************************************************
    printVerbose("\t Renaming sequences")
    # "~/programs/fastx/bin/fastx_renamer -n COUNT -i %s %s"
    rename_outFile_f = os.path.join("outDir/", os.path.basename(args.input_f) + "_renamed")
    rename_outFile_r = os.path.join("outDir/", os.path.basename(args.input_r) + "_renamed")
    pool.map(runInstance, [ProgramRunner("fastx_renamer", [args.input_f, rename_outFile_f],
                                         {"exists": [args.input_f]}),
                           ProgramRunner("fastx_renamer", [args.input_r, rename_outFile_r],
                                         {"exists": [args.input_r]}),
                           ])
    printVerbose("\tRenamed X sequences")

    # *****************************************************************************************
    # Make the contigs using Pear
    # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s"
    assembledPrefix = os.path.join("outDir", args.name)
    pool.map(runInstance, [ProgramRunner("pear",
                                         (rename_outFile_f, rename_outFile_r, assembledPrefix, args.threads),
                                         {"exists": [rename_outFile_f, rename_outFile_r]})
                           ])
    assembledFastqFile = os.path.join("outDir", args.name + ".assembled.fastq")
    # TODO: add py char to a web-page
    printVerbose("\t %s sequences assembled, %s contigs discarded, %s sequences discarded" % (1, 1, 1))

    # *****************************************************************************************
    # Convert the fastq to a fasta file (do with mothur or Bio.SeqIO to keep program dependencies to a minimum)
    pool.map(runInstance, [ProgramRunner("fastq.info", [assembledFastqFile],
                                         {"exists": [assembledFastqFile]})
                           ])
    assembledFastaFile = os.path.splitext(assembledFastqFile)[0] + ".fasta"
    # TODO: add py char to a web-page
    printVerbose("\t converted fastq to fasta")

    # *****************************************************************************************
    # Trim and assign reads to groups
    # trim.seqs(fasta=%s, oligos=%s, maxambig=0, maxhomop=8, minlength=300, maxlength=550, bdiffs=1, pdiffs=2)
    pool.map(runInstance, [ProgramRunner("trim.seqs", [assembledFastaFile, args.barcodes],
                                         {"exists": [assembledFastaFile]})
                           ])
    printVerbose("\t %s sequences were assigned to groups and %s sequences were discarded")
    trimmedFastaFile = os.path.splitext(assembledFastqFile)[0] + ".trim.fasta"

    # *****************************************************************************************
    # Align against the BIOCODETEMPLATE database
    pool.map(runInstance, [ProgramRunner("align.seqs", [trimmedFastaFile, args.db],
                                         {"exists": [trimmedFastaFile]})
                           ])
    printVerbose("\t %s sequences were aligned and %s sequences were discarded")
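# The fastq-to-fasta step above notes it could be done with mothur or
# Bio.SeqIO; a small sketch of the Bio.SeqIO alternative, deriving the output
# name the same way preprocessData does. Illustrative only.
import os
from Bio import SeqIO


def fastq_to_fasta(fastq_path):
    """Convert a fastq file to a fasta file alongside it and return the new path."""
    fasta_path = os.path.splitext(fastq_path)[0] + ".fasta"
    SeqIO.convert(fastq_path, "fastq", fasta_path, "fasta")
    return fasta_path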
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect
                        the first round).  For data sets with different average sequence lengths, this
                        parameter should be tuned such that it won't take too long for each block to do
                        pairwise alignment.  Hint for choosing z: z*L < 150,000, where L is the average
                        length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: The number of iterations of MCMC.  Default value is 2000.  Increase this value to
                        enhance accuracy (recommended value is at least 10*block size).
    :param maxsm: The maximum number of 'split and merge' processes to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as
                        either 'abundant' or 'rare'.  'Abundant' clusters will be clustered first, then the
                        'rare' clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters
                        which cannot be mapped will be clustered separately.  e.g. If r=5, clusters with
                        size <=5 will be considered 'rare' in the above procedure, and r=0 will yield the
                        best accuracy.  If you believe your data is not too diverse to be handled, then r=0
                        will be the best choice.
    :param blockcount: The size of blocks in the first round of clustering.  Hint for choosing -b: each
                        block in the first round should contain about 50 sequences, i.e. b=N/50, where N is
                        the number of input sequences.  Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                                 maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def cluster_swarm(self, input_f, outdir, groupsfile, processes, extraargstring):
    """Clusters sequences using SWARM.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.  Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be singleton
                        sequences.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM,
                                [input_,
                                 "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)
    printVerbose("Done parsing groups files.")

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the new clustered groups files (which need to be cleaned)
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    # Convert the seeds files to uppercase (swarm writes sequences in lowercase)
    printVerbose("Capitalizing sequences")
    inputs = getInputFiles(outdir, "*_seeds")
    run_parallel([PythonRunner(capitalize_seqs, [input_, "%s.fasta" % input_], {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done capitalizing sequences")

    # Collect the groups files from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
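
# ---------------------------------------------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline): cluster_swarm uppercases the seeds files because
# swarm writes sequences in lowercase.  A minimal stand-in for that step could look like the hypothetical
# function below (the real capitalize_seqs may use Bio.SeqIO or handle fastq as well); header lines are
# kept as-is and sequence lines are uppercased.
def _sketch_capitalize_fasta(in_path, out_path):
    with open(in_path) as src, open(out_path, "w") as dest:
        for line in src:
            if line.startswith(">"):
                dest.write(line)            # header line: keep unchanged
            else:
                dest.write(line.upper())    # sequence line: uppercase the bases
# ---------------------------------------------------------------------------------------------------------
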
def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
    """Clusters sequences using VSEARCH.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.  Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be singleton
                        sequences.
    :param idpct: Real number in the range (0, 1] that specifies the minimum similarity threshold for
                        clustering.  e.g. 0.95 indicates that a candidate sequence must be at least 95%
                        similar to the seed sequence to be included in the cluster.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # " --cluster_size %s -id %f --centroids %s --uc %s",
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                [input_, float(idpct),
                                 "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the new clustered groups files (which need to be cleaned)
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    # Collect the groups files from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
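
# ---------------------------------------------------------------------------------------------------------
# Illustrative sketch only (not part of the pipeline): the idpct docstring above requires a value in the
# range (0, 1].  A hypothetical caller-side check like the one below would reject values such as 95 (meant
# as a percentage) before they reach vsearch; cluster_vsearch itself only validates via the ProgramRunner
# requirements dict (e.g. "exists").
def _sketch_validate_idpct(idpct):
    value = float(idpct)
    if not 0 < value <= 1:
        raise ValueError("idpct must be in the range (0, 1]; got %r" % idpct)
    return value
# ---------------------------------------------------------------------------------------------------------
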