def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
    """Demuxes using the FASTX Barcode Splitter.

    :param input_f: File path to input file or folder of input files.
    :param barcodes: File path to input barcodes file.
    :param outdir: Filepath to output directory.
    :param processes: Number of processes to use to demux input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Get input files
    files_to_split = getInputFiles(input_f)
    # Assign each file a shard number
    file_id = range(len(files_to_split))
    file_id_pairs = zip(files_to_split, file_id)
    debugPrintInputInfo(files_to_split, "demux")
    pool = init_pool(min(len(file_id_pairs), processes))
    printVerbose("Demuxing sequences...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                [input_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % id_],
                                {"exists": [input_, barcodes]}, extraargstring)
                  for input_, id_ in file_id_pairs], pool)
    printVerbose("Demuxed sequences.")

    # Grab all the auxiliary files
    aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
    # Make an aux dir for extraneous files and move them there
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
    """Uses a sliding window to identify and trim away areas of low quality.

    :param input_f: Filepath to input file or folder.
    :param outdir: Filepath to the output directory.
    :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for
                        quality analysis.)
    :param quality: Minimum quality allowed. Sections with lower average quality than this will be dropped.
    :param min_len: Minimum allowed length for TRIMMED sequences. (i.e. if a sequence is too short after
                    trimming, it's dropped.)
    :param processes: Number of processes to use to clean the input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # "trimmomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #       -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clean")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Cleaning sequences with Trimmomatic...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                                [input_, "%s/%s_cleaned.fastq" % (outdir, strip_ixes(input_)),
                                 window_size, quality, min_len],
                                {"exists": [outdir, input_],
                                 "positive": [window_size, quality, min_len]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done cleaning sequences.")
    cleanup_pool(pool)
def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
    """Uses PEAR to assemble paired F/R read files in run_parallel.

    :param input_f: File path to forward Fastq Reads file or folder.
    :param input_r: File path to reverse Fastq Reads file or folder.
    :param outdir: File path to the output directory.
    :param name: File prefix for the assembled reads.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    :param pearthreads: The number of threads per process to use.
    """
    # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tAssembling reads with PEAR")
    debugPrintInputInfo(inputs, "assemble")
    run_parallel([ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                [forwards, reverse, "%s/%s_%s" % (outdir, name, getFileName(forwards)),
                                 pearthreads],
                                {"exists": [forwards, reverse], "positive": [pearthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done assembling sequences.")

    # Grab all the auxiliary files (everything not containing ".assembled.")
    aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
    # Make an aux dir for extraneous files and move them there
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
    """Resolves vsearch matches in a vsearch output file to the taxonomic name taken from BIOCODE.
    Takes in a vsearch output file from usearch_global, parses the result for good matches, and writes an
    output file mapping sequence name to taxa name.

    :param vsearch_outfile: An output file from vsearch's usearch_global program.
    :param taxInfo: A two-column tabular file mapping BIOCODE sequence names to taxonomic identifier strings.
    :param output_file: Where to write the resulting file that maps sequence ID to taxonomic name.
    :param min_simmilarity: The minimum similarity for an acceptable vsearch match.
    :param min_coverage: The minimum coverage for an acceptable vsearch match.
    """
    printVerbose("Parsing Vsearch Output")
    min_simm = float(min_simmilarity)
    min_coverage = float(min_coverage)
    biocodeTax = buildTaxaDict(taxInfo)
    printVerbose("Constructed identity dictionary with %d entries from %s." % (len(biocodeTax), taxInfo))
    rslt = []
    with open(output_file, 'w') as out:
        printVerbose("Reading %s as query file..." % vsearch_outfile)
        for line in open(vsearch_outfile, 'r'):
            data = line.split()
            if float(data[2]) > min_simm and float(data[4]) > min_coverage:
                if data[1] in biocodeTax:
                    printVerbose("Found %s as %s" % (data[1], biocodeTax[data[1]]))
                    data.append(biocodeTax[data[1]])
                    rslt.append("\t".join(data))
                else:
                    printErrorMissingID(out, data[1])
        out.write("\n".join(rslt))
    printVerbose("Wrote %d identified sequences to %s" % (len(rslt), output_file))
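# Hedged illustration (made-up values) of the input this parser expects: with vsearch run
# using "--userfields query+target+id+alnlen+qcov" (see query_fasta_db_vsearch below), each
# line is tab-separated as query, target, %id, alignment length, %query coverage, which is
# why data[2] (similarity) and data[4] (coverage) are tested above. The 97/85 thresholds
# echo the defaults mentioned in the callers' comments, not fixed values.
def _example_parse_userfields_line():
    example_line = "BALI_113_ID1\tBIOCODE_SEQ_42\t98.6\t313\t99.0"
    data = example_line.split()
    assert float(data[2]) > 97.0 and float(data[4]) > 85.0  # similarity and coverage checks
    return data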
def demux_by_name(self, input_f, barcodes, outdir, filetype, processes, extraargstring):
    """Demuxes using SeqIO.

    :param input_f: File path to input file or folder of input files.
    :param barcodes: File path to input barcodes file.
    :param outdir: Filepath to output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param processes: Number of processes to use to demux input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    aux_dir = makeAuxDir(outdir)
    # Get input files
    files_to_split = getInputFiles(input_f)
    # Assign each file a shard number
    file_id = range(len(files_to_split))
    file_id_pairs = zip(files_to_split, file_id)
    debugPrintInputInfo(files_to_split, "demux")
    pool = init_pool(min(len(file_id_pairs), processes))
    printVerbose("Demuxing sequences...")
    run_parallel([PythonRunner(split_on_name,
                               [input_, barcodes, outdir, id_, filetype],
                               {"exists": [input_]})
                  for input_, id_ in file_id_pairs], pool)

    # Grab all the auxiliary files
    aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
    # Move extraneous files to the aux dir
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc.,
    where <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")
    # Run serialRename in run_parallel
    run_parallel([PythonRunner(serialRename,
                               [input_,
                                "%s/%s_renamed%s" % (outdir, strip_ixes(input_),
                                                     os.path.splitext(input_)[1]),
                                filetype, clip],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done renaming sequences...")

    samples_dir = makeDirOrdie("%s_samples" % outdir)
    samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
    bulk_move_to_dir(samples_files, samples_dir)

    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def main(argv):
    parser = argparse.ArgumentParser(description="arms description", epilog="arms long description")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version)
    # store_true needs a False default; otherwise the flag can never be disabled
    parser.add_argument("--verbose", default=False, help="increase output verbosity", action="store_true")
    parser.add_argument('-t', '--threads', type=int, default=1)
    parser.add_argument('--dryRun', action='store_true', default=False)
    subparsers = parser.add_subparsers(dest='action', help='Available commands')

    # preprocess data
    parser_preprocess = subparsers.add_parser('preprocess')
    parser_preprocess.add_argument('-n', '--name', required=True, help="Run Id")
    parser_preprocess.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads")
    parser_preprocess.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads")
    parser_preprocess.add_argument('-b', '--barcodes', required=True,
                                   help="Tab-delimited file of barcodes and their samples")
    parser_preprocess.add_argument('-o', '--outDir', required=True,
                                   help="Directory where outputs will be saved")
    parser_preprocess.add_argument('-d', '--db', required=True,
                                   help="Db against which the sequences are aligned")
    parser_preprocess.set_defaults(func=preprocessData)

    global args
    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(format=FORMAT, level=logging.DEBUG)
    else:
        logging.basicConfig(format=FORMAT, level=logging.ERROR)
    printVerbose.VERBOSE = args.verbose
    printVerbose("Running with %s threads" % args.threads)
    pool = Pool(processes=args.threads)
    logging.debug("Initial ARGS are:")
    logging.debug(args)
    args.func(args, pool)
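# Hedged usage sketch: the entry-point module name and input paths below are hypothetical;
# only the flags come from the parser defined above.
#
#   python chewbacca.py --verbose -t 4 preprocess -n run1 \
#       -f forward_reads.fastq -r reverse_reads.fastq \
#       -b barcodes.txt -o run1_out -d reference_db.fasta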
def query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool):
    """Runs a VSEARCH alignment on pairs of query/reference sequences.

    :param inputs: A list of pairs of (filepaths to) query_fastas and the reference fastas to compare
                   them to.
    :param outdir: Filepath to the directory where the alignment result should be written.
    :param aln_user_string: An optional string of commandline parameters passed to the VSEARCH program.
    :param simmilarity: The minimum similarity percentage (between reference and query sequences, as a
                        decimal between 0 and 1) required for a positive match.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    :param pool: A fully initialized multiprocessing.Pool object.
    """
    printVerbose("Aligning against reference sequences...")
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    run_parallel([ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                                [processes, query_fasta, ref_fasta, simmilarity,
                                 "%s/%s.out" % (outdir, strip_ixes(query_fasta)),
                                 "%s/%s.alnout" % (outdir, strip_ixes(query_fasta)),
                                 aln_user_string],
                                {"exists": [query_fasta, ref_fasta], "positive": [processes]},
                                extraargstring)
                  for query_fasta, ref_fasta in inputs], pool)
    printVerbose("Done aligning.")
def annotate_otu_chewbacca(self, input_f, outdir, annotation, processes):
    """Annotates an OTU table.

    :param input_f: Filepath to a file or folder of files to annotate.
    :param annotation: Filepath to a file or a folder of files to use as annotations.
    :param outdir: Filepath to the output directory where annotated files will be written.
    :param processes: The maximum number of processes to use.
    """
    matrices = getInputFiles(input_f)
    debugPrintInputInfo(matrices, "annotated.")
    annotations = getInputFiles(annotation)

    # If all the annotation files are empty, just copy over the files.
    if len(annotations) == 0 and len(getInputFiles(annotation, ignore_empty_files=False)) > 0:
        pool = init_pool(min(len(matrices), processes))
        print "**WARNING**: Annotation file is empty. Skipping annotation and copying old OTU tables " \
              "to output directory.\n"
        run_parallel([PythonRunner(copy_file, [matrix, outdir], {"exists": [matrix]})
                      for matrix in matrices], pool)
    else:
        pool = init_pool(min(len(matrices) * len(annotations), processes))
        debugPrintInputInfo(annotations, "parsed.")
        inputs = product(matrices, annotations)
        printVerbose("Annotating matrix...")
        # Name each annotated table after its source matrix so parallel jobs don't clobber each other
        run_parallel([PythonRunner(annotateOTUtable,
                                   [matrix, annotation,
                                    "%s/%s_annotated.txt" % (outdir, strip_ixes(matrix))],
                                   {"exists": [matrix, annotation]})
                      for matrix, annotation in inputs], pool)
    printVerbose("Done Annotating.")
    cleanup_pool(pool)
def preclean_bayeshammer(self, input_f, input_r, outdir, processes, bayesthreads, extraargstring):
    """Precleans paired (left and right) fastq reads using the SPAdes BayesHammer error-correction module.

    :param input_f: File path to file or folder of left reads to clean.
    :param input_r: File path to file or folder of right reads to clean.
    :param outdir: Filepath to output directory.
    :param bayesthreads: The number of threads per process to use.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Collect input files, and validate that they match
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tPrecleaning %s reads with SPAdes BayesHammer..." % len(inputs))
    debugPrintInputInfo(inputs, "preclean/fix.")
    run_parallel([ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                                [forwards, reverse, outdir, bayesthreads],
                                {"exists": [forwards, reverse], "positive": [bayesthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done cleaning reads.")

    # Select output files
    aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
    corrected_dir = "%s/corrected" % outdir
    bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
    aux_files += getInputFiles(outdir, "*unpaired*", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

    # Gather aux files
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Rename output files
    output_files = getInputFiles(outdir, "*", "corrected_*")
    for out_file in output_files:
        move(out_file, "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

    # Move the last-minute log file
    try:
        move("%s/corrected_corrected.fastq" % outdir, "%s/corrected_corrected.fastq" % aux_dir)
    except Exception:
        # The file may not exist; nothing to move
        pass
    cleanup_pool(pool)
def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                        extraargstring):
    """Compares reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param referencefasta: Filepath to a file or folder of files to use as a reference.
    :param taxinfo: Filepath to a file containing taxonomic info correlated with the referencefasta.
    :param simmilarity: The % similarity between a query and reference sequence required for positive
                        identification.
    :param coverage: The % coverage of matching regions between a query and reference sequence required
                     for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt

    # Expecting a fasta to annotate
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried for identification.")
    ref_fastas = getInputFiles(referencefasta)
    debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
    tax_info_files = getInputFiles(taxinfo)
    debugPrintInputInfo(tax_info_files, "referenced for taxonomic names.")

    # Make sure the number of reference fasta files is the same as the number of tax_info files
    if len(tax_info_files) != len(ref_fastas):
        print "Error: The number of reference fastas and taxonomic mapping files is not the same. There " \
              "must be one taxonomic mapping file for each reference fasta."
        return
    ref_data_pairs = zip(ref_fastas, tax_info_files)
    inputs = [x for x in product(query_fastas, ref_fastas)]
    aln_user_string = ""
    pool = init_pool(min(len(inputs), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (e.g. 97% similarity, 85% coverage)
    # in parsed_BIOCODE.out. Parameters can be changed and this command can be rerun as many times as
    # necessary.
    #
    # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage)
    inputs = [x for x in product(query_fastas, ref_data_pairs)]
    debugPrintInputInfo(inputs, "queried against paired references.")
    run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ref_fasta, tax_info]})
                  for query, (ref_fasta, tax_info) in inputs], pool)
    printVerbose("\nDone parsing...")

    # Gather and move auxiliary files
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db, simmilarity, coverage, processes,
                           extraargstring):
    """Compares reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param ref_fasta: Filepath to the curated fasta file to use as a reference.
    :param ref_db: Filepath to the curated reference database (mapping GIs to taxon IDs).
    :param simmilarity: Minimum % similarity (decimal between 0 and 1) between query and reference
                        sequences required for positive identification.
    :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference sequences
                     required for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
    aln_user_string = "--userfields query+target+id+alnlen+qcov"
    # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
    # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
    coi_fasta = ref_fasta
    ncbi_db_string = ref_db

    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried against the DB.")
    inputs = [x for x in product(query_fastas, [coi_fasta])]
    pool = init_pool(min(len(query_fastas), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (e.g. 97% similarity, 85% coverage)
    # in parsed_nt.out. Parameters can be changed and this command can be rerun as many times as necessary.
    #
    # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity) > parsed_nt.out
    run_parallel([PythonRunner(parseVSearchOutputAgainstNCBI,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ncbi_db_string]})
                  for query in query_fastas], pool)
    printVerbose("Done processing.")

    # Gather and move auxiliary files
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def convert_chewbacca(self, input_f, outdir, processes):
    """Converts a file or folder of fastq files to fasta format.

    :param input_f: Filepath to a fastq file or folder of fastq files to convert.
    :param outdir: Filepath to the output directory.
    :param processes: The maximum number of processes to use.
    """
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "convert to fasta.")
    printVerbose("Converting to fasta...")
    pool = init_pool(min(len(inputs), processes))
    run_parallel([PythonRunner(translateFastqToFasta,
                               [input_, "%s/%s.fasta" % (outdir, getFileName(input_))],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done converting.")
    cleanup_pool(pool)
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file, min_coverage, min_similarity):
    """Resolves vsearch matches in a vsearch output file to the taxonomic name taken from NCBI.
    Takes in a vsearch output file from usearch_global, parses the result for good matches, and writes an
    output file mapping sequence name to taxa name.

    :param vsearch_out: An output file from vsearch's usearch_global program.
    :param database: Filepath to an SQLite database mapping GIs to NCBI taxon IDs.
    :param output_file: Where to write the resulting file that maps sequence ID to taxonomic name.
    :param min_coverage: The minimum coverage for an acceptable vsearch match.
    :param min_similarity: The minimum similarity for an acceptable vsearch match.
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()
    conn = sqlite3.connect(database)
    c = conn.cursor()
    query = "select taxid from gi_taxid where gi=%s"

    def getTaxFromId(taxId, taxonomy=["species", "genus", "family", "order", "class", "phylum"]):
        myTaxonomy = dict([(a, "") for a in taxonomy])
        taxId = int(taxId)
        for lin in ncbi.get_lineage(taxId):
            rank = ncbi.get_rank([lin]).values()[0]
            if rank in taxonomy:
                val = ncbi.get_taxid_translator([lin]).values()[0]
                myTaxonomy[rank] = val
        return ":".join([myTaxonomy[x] for x in taxonomy[::-1]])

    with open(output_file, 'w') as out:
        for line in open(vsearch_out, 'r'):
            data = line.split()
            # Require both the coverage and the similarity thresholds to be met
            if float(data[4]) > min_coverage and float(data[2]) > min_simm:
                hit = c.execute(query % data[1]).fetchone()
                if hit:
                    taxonomy = getTaxFromId(hit[0])
                    data.append(taxonomy)
                    printVerbose("\t".join(data))
                    out.write("\t".join(data))
                    out.write("\n")
                else:
                    printErrorMissingID(out, data[1])
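# Hedged illustration of the string shape getTaxFromId returns (taxon names are made up):
# taxonomy[::-1] reverses the rank list, so the fields run from phylum down to species,
# with empty strings left for ranks missing from the NCBI lineage.
def _example_taxonomy_string():
    taxonomy = ["species", "genus", "family", "order", "class", "phylum"]
    myTaxonomy = {"species": "Carabus auratus", "genus": "Carabus", "family": "Carabidae",
                  "order": "Coleoptera", "class": "Insecta", "phylum": "Arthropoda"}
    return ":".join([myTaxonomy[x] for x in taxonomy[::-1]])
    # -> "Arthropoda:Insecta:Coleoptera:Carabidae:Carabus:Carabus auratus"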
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes,
                                extraargstring):
    """Uses flexbar to trim adapters and barcodes from sequences. By default, flexbar does not allow any
    'N' characters in SEQUENCE, and will toss any sequences that do contain 'N'. To avoid this, use the -u
    or --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a
                      sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"
    # Trim adapters from the left
    run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                [input_file,
                                 temp_file_name_template % (outdir, strip_ixes(input_file)),
                                 "LEFT", adapters, allowedns],
                                {"exists": [input_file, adapters]},
                                extraargstring)
                  for input_file in inputs], pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")

    # Trim the reverse-complemented adapters from the right
    run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                [input_file,
                                 debarcoded_file_name_template % (outdir, strip_ixes(input_file)[5:]),
                                 "RIGHT", adaptersrc, allowedns],
                                {"exists": [input_file, adaptersrc]},
                                extraargstring)
                  for input_file in temp_files], pool)
    printVerbose("Done trimming sequences.")

    # Move temp files to the aux dir
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def annotateOTUtable(otu_file, annotation_file, out_file, id_col=0, tax_col=6,
                     clip_count_from_annotations=True):
    """Given the best hits from a cleaned, annotated (with taxonomic names) vsearch output file, renames
    each sequence ID in the OTU table with its taxonomic name from the vsearch outfile.

    :param otu_file: The OTU file to annotate.
    :param annotation_file: Vsearch output file, cleaned by parse.ParseVsearchOutForTaxa.py.
    :param out_file: Filepath to write the resulting annotated OTU file.
    :param id_col: The column number (zero-indexed) in the vsearch file containing sequence fasta IDs.
    :param tax_col: The column number (zero-indexed) in the vsearch file containing the taxonomic IDs.
    :param clip_count_from_annotations: If True, clip dereplication counts from vsearch sequence fasta IDs.
                                        Default: True.
    :return: Filepath to the resulting annotated OTU table.
    """
    # Create a dictionary mapping sequence ID to taxonomic names
    id_to_tax = {}
    for line in open(annotation_file, 'r'):
        data = line.split("\t")
        seq_id = data[id_col].rstrip()
        if clip_count_from_annotations:
            seq_id = clip_count(seq_id, '_')
        tax = data[tax_col].rstrip()
        id_to_tax[seq_id] = tax
    printVerbose("Constructed a dictionary of %d identities from %s." % (len(id_to_tax), annotation_file))

    # Parse through the matrix file again to replace any identifiable IDs and reformat the file
    printVerbose("Annotating %s with found identities." % otu_file)
    with open(out_file, 'w') as out:
        for line in open(otu_file, 'r'):
            line = line.rstrip()
            current_id = line.split()[id_col]
            if current_id in id_to_tax:
                tax = id_to_tax[current_id]
                out.write("%s\t%s\n" % (line, "".join(tax.split())))
            else:
                out.write("%s\tUnclassified\n" % line)
    return out_file
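# Hedged illustration (made-up values) of the join above: with the default id_col=0 and
# tax_col=6, the annotation row supplies a taxonomy string that is appended, with internal
# whitespace removed, to the matching OTU row; clip_count is assumed to strip the trailing
# "_3" abundance suffix from the annotation's sequence ID.
def _example_annotate_row():
    annotation = "BALI_113_ID1_3\tref_7\t98.6\t313\t99.0\talnout\tArthropoda: Insecta"
    otu_row = "BALI_113_ID1\t12\t0\t4"
    data = annotation.split("\t")
    id_to_tax = {clip_count(data[0], '_'): data[6].rstrip()}
    tax = id_to_tax[otu_row.split()[0]]
    return "%s\t%s\n" % (otu_row, "".join(tax.split()))
    # -> "BALI_113_ID1\t12\t0\t4\tArthropoda:Insecta\n"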
def splitK(inputFasta, prefix, nbSeqsPerFile, filetype):
    """Splits an input fasta/fastq file into chunks of nbSeqsPerFile sequences each."""
    mySeqs = SeqIO.parse(inputFasta, filetype)
    chunk = 0
    sequences = []
    for mySeq in mySeqs:
        mySeq.seq = mySeq.seq.ungap(".")
        # Skip sequences shorter than 200bp
        if len(mySeq.seq) < 200:
            continue
        sequences.append(mySeq)
        if len(sequences) == nbSeqsPerFile:
            with open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w') as out:
                SeqIO.write(sequences, out, filetype)
            sequences = []
            chunk += 1
    # Write the remaining partial chunk, if any
    if sequences:
        with open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w') as out:
            SeqIO.write(sequences, out, filetype)
        chunk += 1
    printVerbose("Split %s into %d parts." % (inputFasta, chunk))
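# Hedged usage sketch (hypothetical paths): split a large fasta into 5000-sequence chunks
# named out/reads_part_0.fasta, out/reads_part_1.fasta, ...
#
#   splitK("reads.fasta", "out/reads", 5000, "fasta")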
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7):
    """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit,
    subject to the ID threshold (default 70%) in vsearch (see ProgramRunnerCommands.ALIGN_VSEARCH).

    :param input_fna: string. Filepath to the input fna fasta file.
    :param ref_fna: string. Filepath to the reference fna fasta file.
    :param outdir: string. Filepath to the output directory for the hits file.
    :param id_pct: float. The minimum identity (as a decimal) for vsearch to report a hit.
    :return: A dictionary mapping each input sequence name to its best-hit row (a list of fields) in the
             reference DB.
    """
    def best_hits_from_vsearch(v_search_output):
        best_hits = {}
        for line in open(v_search_output, 'r'):
            data = line.split("\t")
            query_name = data[0].rstrip()
            # Keep the hit with the highest %id for each query
            if query_name in best_hits:
                if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()):
                    best_hits[query_name] = data
            else:
                best_hits[query_name] = data
        return best_hits

    print "calling vsearch"
    processes = 1
    aln_user_string = ""
    extraargstring = ""
    printVerbose("Aligning against reference sequences...")
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                  [processes, input_fna, ref_fna, id_pct,
                   "%s/%s.out" % (outdir, strip_ixes(input_fna)),
                   "%s/%s.alnout" % (outdir, strip_ixes(input_fna)),
                   aln_user_string],
                  {"exists": [input_fna, ref_fna], "positive": [processes]},
                  extraargstring).run()
    vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna))
    # Choose the best hit per query
    return best_hits_from_vsearch(vsearch_output)
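# Hedged illustration (made-up rows) of the best-hit selection above: for each query, the
# row with the highest %id (column 2 of the userfields output) wins.
def _example_best_hit_selection():
    rows = ["q1\trefA\t91.0\t300\t99.0",
            "q1\trefB\t97.5\t310\t98.0"]
    best = {}
    for line in rows:
        data = line.split("\t")
        name = data[0]
        if name not in best or float(best[name][2]) < float(data[2]):
            best[name] = data
    return best  # -> {'q1': ['q1', 'refB', '97.5', '310', '98.0']}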
def visualize_otu_heatmap(self, data_frame, output_file):
    """Visualizes an OTU table as a heatmap, showing OTU abundance in each sample.

    :param data_frame: A pandas dataframe to graph.
    :param output_file: Filepath to the output graphics file.
    """
    ncols = len(data_frame.columns.values)
    nrows = len(data_frame.index.values)
    printVerbose("Computing dataframe values...")
    fig, ax = plt.subplots(figsize=(10 + ncols / .5, 7 + nrows / 10.0))
    heatmap = ax.pcolor(data_frame, cmap=plt.cm.binary)
    ax.set_xticks(np.arange(ncols) + 0.5)
    ax.set_yticks(np.arange(nrows) + 0.5)
    ax.set_xticklabels(data_frame.columns.values, rotation=90, fontsize=4)
    ax.set_yticklabels(data_frame.index.values, fontsize=4)
    plt.colorbar(heatmap)
    printVerbose("Saving image %s..." % output_file)
    plt.savefig(output_file, dpi=200)
def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
    """Removes gap characters from sequences using BioPython.

    :param input_f: Filepath to input file or folder to ungap.
    :param outdir: Filepath to the output directory where ungapped files should be written.
    :param gapchars: A string containing the gap characters to remove.
    :param file_ext: Either 'fasta' or 'fastq'.
    :param processes: The number of threads to use to ungap the input fileset.
    """
    inputs = getInputFiles(input_f, "*.fasta")
    debugPrintInputInfo(inputs, "ungap.")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Removing all '%s' from sequences..." % gapchars)
    # ungap(file_to_clean, output_file_name, gap_char, file_type)
    run_parallel([PythonRunner(remove_gap_chars,
                               [input_, "%s/%s_cleaned.%s" % (outdir, strip_ixes(input_), 'fasta'),
                                gapchars, file_ext],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done removing.")
    cleanup_pool(pool)
def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
    """Partition a fasta/fastq file into chunks of user-defined size.

    :param input_f: Filepath to a file or folder of files to partition.
    :param outdir: The directory to write split files to.
    :param processes: The number of processes to use to partition the input fileset.
    :param chunksize: The number of sequences per file.
    :param filetype: Either 'fasta' or 'fastq'.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "partitioned")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Partitioning Files...")
    run_parallel([PythonRunner(splitK,
                               [input_, "%s/%s" % (outdir, strip_ixes(input_)), chunksize, filetype],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done partitioning files.")
    cleanup_pool(pool)
def split_on_name(input_f, barcodes_file, outdir, id_, filetype):
    """Demuxes a single input file into separate files, matching unique strings in sequence names to the
    unique sample names in the barcodes_file.

    :param input_f: Filepath to the fasta file to demux.
    :param barcodes_file: Filepath to the barcodes file to parse. Note: only the sample names are read;
                          barcode data can be faked.
    :param outdir: Filepath to the output directory where demuxed fasta files should be written.
    :param id_: A unique integer id for a fasta file to demux.
    :param filetype: Either 'fasta' or 'fastq', indicating the input and output filetypes.
    """
    unmatched_name = "unmatched"
    sample_names = parse_barcodes_to_dict(barcodes_file).keys()
    sample_names.append(unmatched_name)
    sample_names.sort(reverse=True)
    printVerbose("Possible samples:")
    printVerbose(str(sample_names))

    out_streams = {}
    for sample_name in sample_names:
        outfile = "%s/%s_%s_splitOut.%s" % (outdir, sample_name, id_, filetype)
        out_streams[sample_name] = BufferedSeqWriter(outfile, filetype)

    seq_dict = SeqIO.index(input_f, filetype)
    for name in seq_dict.keys():
        matched = False
        for sample_name in sample_names:
            if sample_name in name:
                out_streams[sample_name].write(seq_dict[name])
                matched = True
                break
        if not matched:
            out_streams[unmatched_name].write(seq_dict[name])

    for writer in out_streams.keys():
        out_streams[writer].flush()
        # Remove empty files
        if out_streams[writer].empty:
            out_streams[writer].delete()
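# Hedged illustration (made-up names): the reverse sort above ensures a longer sample name
# like "BALI_113" is tested before its prefix "BALI_11", so substring matching routes a
# read to the most specific sample; anything unmatched falls through to "unmatched".
def _example_name_match():
    sample_names = sorted(["BALI_113", "BALI_11", "unmatched"], reverse=True)
    read_id = "BALI_113_ID7"
    for sample_name in sample_names:
        if sample_name in read_id:
            return sample_name  # -> "BALI_113" rather than "BALI_11"
    return "unmatched"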
def build_otu_chewbacca(self, outdir, groups_file, samples_file, barcodes_file):
    """Builds the unannotated OTU table using a custom chewbacca script.

    :param outdir: The directory where the matrix should be written.
    :param groups_file: A .groups file containing the OTU names and their constituent/replicant sequences.
    :param samples_file: A .samples file containing the samples that each sequence in the .groups file
                         belongs to.
    :param barcodes_file: A .barcodes file listing all sample names.
    """
    groups = getInputFiles(groups_file)
    debugPrintInputInfo(groups, "read.")
    samples = getInputFiles(samples_file)
    debugPrintInputInfo(samples, "read.")
    barcodes = getInputFiles(barcodes_file)
    debugPrintInputInfo(barcodes, "read.")
    printVerbose("Building matrix...")
    buildOTUtable(groups, samples, barcodes[0], "%s/%s.txt" % (outdir, "matrix"))
    printVerbose("Done building.")
def parseGroupsFileToDict(groups_file, thing_to_map):
    """Given a .groups file, returns a dictionary mapping each seed to either a count of its children, or
    a space-delimited string of its children's names.

    :param groups_file: A .groups file.
    :param thing_to_map: Specify 'children' to map seed names to a space-delimited string of children
                         names, or 'counts' to map seed names to a count of children.
    :return: A dictionary mapping each seed to either a count of its children, or a space-delimited string
             of its children's names.
    """
    groups = {}
    printVerbose("Reading count file: %s" % groups_file)
    # Collect the seed names, and the children sequence names
    nb_lines = 0
    for line in open(groups_file, 'r'):
        nb_lines += 1
        data = line.rstrip().split("\t")
        seed = data[0]
        children = ""
        if thing_to_map == "children":
            if len(data) > 1:
                children = ' '.join(list(set(data[1:])))
            groups[seed] = children
        if thing_to_map == "counts":
            if len(data) > 1:
                children = data[1]
                groups[seed] = len(children.split(" "))
        if nb_lines % 100000 == 0:
            printVerbose("%s lines processed" % nb_lines)
    printVerbose("Done reading count file.")
    return groups
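# Hedged illustration (made-up names) of the two mapping modes above, for a groups-file
# line whose second tab field holds space-delimited children:
def _example_groups_mappings():
    line = "seed1\tchild1 child2 child3"
    data = line.rstrip().split("\t")
    children_map = {data[0]: ' '.join(list(set(data[1:])))}  # 'children' mode
    counts_map = {data[0]: len(data[1].split(" "))}          # 'counts' mode
    return children_map, counts_map
    # -> ({'seed1': 'child1 child2 child3'}, {'seed1': 3})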
def handle_groups_file_update(outdir, groupsfile, clustering_groups_files_uncount):
    """Checks if the user-specified groups file exists, and updates the groupings with clustering data.
    Returns a list of the most up-to-date groups files.

    :param outdir: Filepath to the directory where output files will be written.
    :param groupsfile: Optional filepath to the .groups file, or a folder of .groups files to use as a
                       reference.
    :param clustering_groups_files_uncount: The output groups file from clustering, with trailing
                                            replication counts removed from sequence names. Names in this
                                            file should match those used in the user-specified groups
                                            file, groupsfile.
    :return: A list of filenames pointing to the most up-to-date groups files.
    """
    most_recent_groups_files = clustering_groups_files_uncount
    if groupsfile:
        # Try to grab groups files
        user_specified_groups_files = getInputFiles(groupsfile, critical=False)
        # If we have files at the given location
        if len(user_specified_groups_files) != 0:
            most_recent_groups_files = user_specified_groups_files
            printVerbose("Updating .groups files with clustering data")
            debugPrintInputInfo(most_recent_groups_files, "used as groups references")
            update_groups(most_recent_groups_files, clustering_groups_files_uncount, outdir, "postcluster")
            printVerbose("Done updating .groups files.")
            most_recent_groups_files = getInputFiles(outdir, "postcluster*.groups")
    else:
        printVerbose("No name files provided, assuming singletons...\n")
    return most_recent_groups_files
def align_macse(self, input_f, db, outdir, processes, extraargstring):
    """Aligns sequences by iteratively adding them to a known good alignment.

    :param input_f: Filepath to an input file or folder to align.
    :param db: Filepath to a reference file or folder of reference files for alignment.
    :param outdir: Filepath to the output directory.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # "macse_align": "java -jar " + programPaths["MACSE"] + " -prog enrichAlignment -seq \"%s\" -align \
    #       \"%s\" -seq_lr \"%s\" -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 \
    #       -maxDEL_inSeq 3 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT \"%s\"_NT \
    #       -out_AA \"%s\"_AA -seqToAdd_logFile \"%s\"_log.csv",
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Aligning reads using MACSE")
    run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_ALIGN,
                                [db, db, input_] + ["%s/%s" % (outdir, getFileName(input_))] * 3,
                                {"exists": [input_, db]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done with MACSE alignment.")
    cleanup_pool(pool)
def renameWithReplicantCounts(input_fasta, groups_file, output_fasta, filetype):
    """Converts a fasta and a groups file to a sorted, dereplicated fasta named by abundance.
    Specifically, each seed in the groups file has the number of sequences it represents (including
    itself) appended as a suffix.
    e.g. The groups file entry:
        BALI_113_ID1    BALI_113_ID2 BALI_113_ID3
    would be named as
        >BALI_113_ID1_3
    in the fasta file to show that it represents 3 sequences (itself, and two other sequences).

    :param input_fasta: Input fasta/fastq file with entries for all items in the groups file.
    :param groups_file: Input groups file showing clustering/grouping.
    :param output_fasta: Output file path.
    :param filetype: Either 'fasta' or 'fastq'.
    :return: Filepath to the output fasta.
    """
    seeds = []
    seedSizes = parseGroupsFileToDictOfCounts(groups_file)
    printVerbose("Indexing reads")
    reads = SeqIO.index(input_fasta, filetype)
    printVerbose("Done indexing reads")
    printVerbose("Renaming sequences")
    with open(output_fasta, 'a') as output_handle:
        for name, count in sorted(seedSizes.items(), key=operator.itemgetter(1), reverse=True):
            s = reads[name]
            s.id = "%s_%s" % (name, count)
            s.description = ""
            seeds.append(s)
            # Write in chunks to limit memory usage
            if len(seeds) == 500000:
                SeqIO.write(seeds, output_handle, filetype)
                seeds = []
        # Write the rest of the last chunk
        SeqIO.write(seeds, output_handle, filetype)
    printVerbose("Done renaming sequences")
    return output_fasta
def merge_chewbacca(self, input_f, outdir, output_filename, output_fileext):
    """Merges files together into a new output file.

    :param input_f: Filepath to a directory of input files.
    :param outdir: Filepath to the output folder.
    :param output_filename: The filename of the output file, without an extension.
    :param output_fileext: The file extension of the output file.
    """
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "merged")
    printVerbose("Merging files.")
    output_file = "%s/%s_MERGED.%s" % (outdir, output_filename, output_fileext)
    merge_files(inputs, output_file)
    printVerbose("Done merging.")
    printVerbose("Merged %d files into %s" % (len(inputs), output_file))
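# merge_files is assumed here to be a straight concatenation of the inputs, in
# order; a minimal stand-in sketch for readers without the helper's source:
def merge_files_sketch(input_paths, output_path):
    with open(output_path, 'w') as out:
        for path in input_paths:
            with open(path, 'r') as in_file:
                for line in in_file:
                    out.write(line)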
def visualize_otu_sample_comp(self, data_frame, output_file):
    """Creates a stacked barchart showing the OTU composition in each sample.

    :param data_frame: A pandas dataframe to graph.
    :param output_file: Filepath to the output graphics file.
    """
    ncols = len(data_frame.columns.values)
    nrows = len(data_frame.index.values)
    printVerbose("Computing dataframe values...")
    # Normalise each column (sample) so its OTU abundances sum to 1.
    sums = data_frame.sum(0).values
    data_frame = data_frame.divide(sums)
    printVerbose("Transposing dataframe...")
    # Transpose so each sample becomes one bar, stacking its OTU fractions.
    data_frame.transpose().plot(kind='bar', stacked=True, ylim=(0, 1),
                                figsize=(10 + ncols / .5, 7 + nrows / 10.0),
                                colormap=plt.cm.hsv)
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=5)
    printVerbose("Saving image %s..." % output_file)
    plt.savefig(output_file, dpi=200)
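# A toy dataframe in the expected shape (rows = OTUs, columns = samples;
# counts are hypothetical), useful for smoke-testing the plot above:
import pandas as pd

toy = pd.DataFrame({"sample_A": [10, 5, 0], "sample_B": [2, 2, 6]},
                   index=["OTU_1", "OTU_2", "OTU_3"])
# e.g. visualizer.visualize_otu_sample_comp(toy, "otu_composition.png"),
# where "visualizer" is an instance of the class this method lives on.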
def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
    """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
        (the samplesdir argument).

    :param input_f: File path to file or folder of files to clean.
    :param ref: Filepath to the reference file used to align the input files.
    :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
    :param outdir: Filepath to the directory to write outputs to.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # "macse_format":  "java -jar " + programPaths["MACSE"] + " -prog exportAlignment -align \"%s\" \
    #       -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""
    from classes.PythonRunner import PythonRunner
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tProcessing MACSE alignments")
    samples_list = getInputFiles(samplesdir)
    run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                ["%s/%s_NT" % (input_f, getFileName(sample)),
                                 "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_macse.csv" % (outdir, getFileName(sample))],
                                {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]},
                                extraargstring) for sample in samples_list], pool)
    printVerbose("\tCleaning MACSE alignments")
    printVerbose("Processing %d samples..." % len(samples_list))
    nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]

    # Clean the alignments
    run_parallel([PythonRunner(remove_refs_from_macse_out,
                               [input_, ref, "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                               {"exists": [input_, ref]})
                  for input_ in nt_macse_outs], pool)

    # Cat the cleaned alignments
    cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
    merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)

    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
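# A hedged sketch of what remove_refs_from_macse_out is assumed to do: drop the
# reference records MACSE was seeded with, and strip gap ("-") and frameshift
# ("!") characters from the remaining records. Biopython assumed; this is not
# the pipeline's actual implementation.
from Bio import SeqIO

def remove_refs_sketch(aligned_fasta, ref_fasta, out_fasta):
    ref_ids = set(rec.id for rec in SeqIO.parse(ref_fasta, "fasta"))
    with open(out_fasta, 'w') as out:
        for rec in SeqIO.parse(aligned_fasta, "fasta"):
            if rec.id in ref_ids:
                continue
            seq = str(rec.seq).replace("-", "").replace("!", "")
            out.write(">%s\n%s\n" % (rec.id, seq))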
def update_groups(old_groups_files, new_groups_files, out_dir, out_prefix):
    """Updates an old_groups file with the results of a new_groups file, and writes the results to a new groups file.
    E.g. Given an old_groups file listing:
        old_groups:     1   2 3
                        4   5 6
    and a new_groups file listing:
        new_groups:     1   4
    then return an out_groups file listing:
        out_groups:     1   2 3 4 5 6
    Finer points:
    1. The list of child sequences following the seed should not contain the seed.
    2. The size of the cluster_main represented by the seed is the number of children succeeding the seed,
        plus one for the seed.

    :param old_groups_files: The previous iteration of the groups file.
    :param new_groups_files: The current iteration of the groups file.
    :param out_dir: Filepath to the directory where the updated groups file should be written.
    :param out_prefix: The prefix for the output filename.
    :return: Filepath to the updated groups file.
    """
    if not (len(old_groups_files) and len(new_groups_files)):
        print("\n***WARNING***: Received empty file lists. Aborting group file update. If you provided a groups "
              "file, something went wrong.\n")
        return
    printVerbose("Using %s and %s to generate updated groups file %s_updated.groups" %
                 (old_groups_files[0], new_groups_files[0], out_prefix))
    old_groups_temp_file = "%s/temp_old_merged.groups" % out_dir
    new_groups_temp_file = "%s/temp_new_merged.groups" % out_dir
    output_file = "%s/%s_updated.groups" % (out_dir, out_prefix)

    # Concat the old and new groups files respectively
    merge_files(old_groups_files, old_groups_temp_file)
    merge_files(new_groups_files, new_groups_temp_file)

    # Parse the groups files to dictionaries mapping each seed to its children
    old_seeds = parseGroupsFileToDictOfChilden(old_groups_temp_file)
    new_seeds = parseGroupsFileToDictOfChilden(new_groups_temp_file)

    new_keys = new_seeds.keys()
    total = len(new_keys)
    i = 0
    outstring = ""
    with open(output_file, 'w') as output:
        for new_seed in new_keys:
            i += 1
            # Flush the write buffer every 10000 seeds
            if i % 10000 == 0:
                printVerbose("Processed %d / %d lines\n" % (i, total))
                output.write(outstring)
                outstring = ""
            my_old_children = []
            children_of_my_new_children = []
            # If this seed was also a seed in the old groups file, it keeps its old children
            if new_seed in old_seeds:
                my_old_children = old_seeds[new_seed].split(" ")
            my_new_children = new_seeds[new_seed].split(" ")
            # Any new child that was itself an old seed brings its own children along
            for entry in my_new_children:
                if entry in old_seeds:
                    children_of_my_new_children += old_seeds[entry].split(" ")
            all_my_children = my_old_children + my_new_children + children_of_my_new_children
            outstring += "%s\t%s\n" % (new_seed, " ".join(set(all_my_children)))
        # Flush whatever remains in the buffer
        output.write(outstring)
    os.remove(old_groups_temp_file)
    os.remove(new_groups_temp_file)
    return output_file
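# A worked version of the docstring example above: new seed "1" absorbed old
# seed "4", so "1" keeps its own old children, gains its new children, and
# inherits the old children of every seed it absorbed.
old_seeds = {"1": "2 3", "4": "5 6"}
new_seeds = {"1": "4"}
all_children = old_seeds["1"].split() + new_seeds["1"].split() + old_seeds["4"].split()
print("1\t" + " ".join(sorted(all_children)))  # 1	2 3 4 5 6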
def buildOTUtable(latest_groups_files, inital_samples_files, barcodes_file, out_file):
    """Given a single barcodes file with all possible sample names, a list of the latest groups file(s), and a \
        list of initial samples files (mapping each original, undereplicated sequence to its sample name), builds \
        an OTU table and writes it to out_file.

    :param latest_groups_files: A list of the latest groups files. No sequence name may occur in more than one \
                                    groups file.
    :param inital_samples_files: A list of the initial samples files. These should map each sequence to its \
                                    parent sample.
    :param barcodes_file: A single barcodes file listing all valid sample names.
    :param out_file: Filepath to the output file.
    """
    print("latest_groups_files: %s " % latest_groups_files)
    print("inital_samples_files: %s " % inital_samples_files)
    print("barcodes_file: %s " % barcodes_file)

    # GENERATE A DICTIONARY MAPPING SEQUENCE NAMES TO THE SAMPLE THEY CAME FROM
    # Read the initial samples files (from rename) and build a single dict mapping each
    # sequence name to its sample. NOTE: sample names are currently derived from the
    # samples files themselves; barcodes_file is not read here.
    seq_to_sample = {}
    all_sample_names = set()
    for samples_file in inital_samples_files:
        printVerbose("Reading samples file: %s" % samples_file)
        with open(samples_file, 'r') as current_samples_file:
            for line in current_samples_file:
                name, sample = line.split()
                sample_name = sample.rstrip()
                seq_to_sample[name] = sample_name
                all_sample_names.add(sample_name)
    all_sample_names = sorted(all_sample_names)
    printVerbose("Found the following sample names:")
    printVerbose(str(all_sample_names))

    with open(out_file, 'w') as out:
        # Write the header row: "OTU" followed by one column per sample
        header_line = "OTU"
        for sample in all_sample_names:
            header_line += "\t%s" % sample
        out.write(header_line + "\n")

        # for each line in the latest groups files,
        for groups_file in latest_groups_files:
            with open(groups_file, 'r') as current_groups_file:
                for line in current_groups_file:
                    # TODO: blank lines appear in some groups files; need to find the reason for this
                    if not line.strip():
                        continue
                    data = line.split("\t")
                    # reset per line, so a singleton does not inherit the previous cluster's children
                    otu = ""
                    children = ""
                    # found a cluster_main
                    if len(data) == 2:
                        otu = data[0].rstrip()
                        children = data[1].rstrip()
                    # found a singleton (no children column)
                    elif len(data) == 1:
                        otu = data[0].rstrip()

                    # GENERATE OTU ABUNDANCE BY SAMPLE
                    # initialize a count dict with each sample as a key and a value of 0
                    sample_counts = {}
                    for sample_name in all_sample_names:
                        sample_counts[sample_name] = 0
                    # look up each child's sample name and increment the abundance in that sample
                    for child in children.split():
                        my_sample = seq_to_sample[child]
                        sample_counts[my_sample] += 1

                    # WRITE THE COUNTS TO THE OUT FILE
                    # write this OTU's per-sample counts as a single tab-separated line
                    out_line = otu
                    for sample_name in all_sample_names:
                        out_line += "\t%s" % sample_counts[sample_name]
                    out.write(out_line + "\n")
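# A minimal sketch of the per-OTU counting step above, with hypothetical names:
# each samples-file line maps "<sequence>\t<sample>", each groups line maps
# "<seed>\t<space-separated children>", and every child increments its sample's
# column in that OTU's row.
seq_to_sample = {"ID2": "BALI_113", "ID3": "PALAU_7", "ID4": "BALI_113"}
children = "ID2 ID3 ID4"
sample_counts = {"BALI_113": 0, "PALAU_7": 0}
for child in children.split():
    sample_counts[seq_to_sample[child]] += 1
print(sorted(sample_counts.items()))  # [('BALI_113', 2), ('PALAU_7', 1)]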
def preprocessData(args, pool=None):
    # TODO: test that run.name is a single word
    # (a default of pool=Pool(processes=1) would spawn a pool at import time, so build it lazily)
    if pool is None:
        pool = Pool(processes=1)
    makeDirOrdie(args.outDir)
    printVerbose("Preprocessing the data:")
    # *****************************************************************************************
    printVerbose("\t Renaming sequences")
    # "~/programs/fastx/bin/fastx_renamer -n COUNT -i %s %s"
    rename_outFile_f = os.path.join(args.outDir, os.path.basename(args.input_f) + "_renamed")
    rename_outFile_r = os.path.join(args.outDir, os.path.basename(args.input_r) + "_renamed")
    pool.map(runInstance, [ProgramRunner("fastx_renamer", [args.input_f, rename_outFile_f],
                                         {"exists": [args.input_f]}),
                           ProgramRunner("fastx_renamer", [args.input_r, rename_outFile_r],
                                         {"exists": [args.input_r]}),
                           ])
    printVerbose("\tRenamed X sequences")
    # *****************************************************************************************
    # Making the contigs using Pear
    # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s"
    assembledPrefix = os.path.join(args.outDir, args.name)
    pool.map(runInstance, [ProgramRunner("pear",
                                         (rename_outFile_f, rename_outFile_r, assembledPrefix, args.threads),
                                         {"exists": [rename_outFile_f, rename_outFile_r]})
                           ])
    assembledFastqFile = os.path.join(args.outDir, args.name + ".assembled.fastq")
    # TODO: add a pie chart to a web page; report real counts instead of the placeholders below
    printVerbose("\t %s sequences assembled, %s contigs discarded, %s sequences discarded" % (1, 1, 1))
    # *****************************************************************************************
    # Converting fastq to fasta file (do with mothur or BioSeqIO to keep prog deps to a minimum)
    pool.map(runInstance, [ProgramRunner("fastq.info", [assembledFastqFile],
                                         {"exists": [assembledFastqFile]})
                           ])
    assembledFastaFile = os.path.splitext(assembledFastqFile)[0] + ".fasta"
    # TODO: add a pie chart to a web page
    printVerbose("\t converted fastq to fasta")
    # *****************************************************************************************
    # Trimming and assigning reads to groups
    # trim.seqs(fasta=%s, oligos=%s, maxambig=0, maxhomop=8, minlength=300, maxlength=550, bdiffs=1, pdiffs=2)
    pool.map(runInstance, [ProgramRunner("trim.seqs", [assembledFastaFile, args.barcodes],
                                         {"exists": [assembledFastaFile]})
                           ])
    # TODO: report real counts instead of a literal format string
    printVerbose("\t %s sequences were assigned to groups and %s sequences were discarded")
    trimmedFastaFile = os.path.splitext(assembledFastqFile)[0] + ".trim.fasta"
    # *****************************************************************************************
    # Aligning against the BIOCODETEMPLATE database
    pool.map(runInstance, [ProgramRunner("align.seqs", [trimmedFastaFile, args.db],
                                         {"exists": [trimmedFastaFile]})
                           ])
    # TODO: report real counts instead of a literal format string
    printVerbose("\t %s sequences were aligned against the reference database")
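# The comment above suggests Bio.SeqIO as a lighter-weight alternative to
# mothur's fastq.info for the fastq -> fasta conversion; a minimal sketch
# (Biopython assumed; paths are hypothetical):
from Bio import SeqIO

count = SeqIO.convert("outDir/run.assembled.fastq", "fastq",
                      "outDir/run.assembled.fasta", "fasta")
print("converted %d records" % count)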
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the \
                        first round). For data sets with different average sequence lengths, this parameter \
                        should be tuned so that pairwise alignment of a block does not take too long. Hint for \
                        choosing z: z*L < 150,000, where L is the average length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering. Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: The number of MCMC iterations. Default value is 2000. Increase this value to enhance \
                        accuracy (recommended value is at least 10*block size).
    :param maxsm: The maximum number of 'split and merge' processes to run. Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                    'abundant' or 'rare'. 'Abundant' clusters are clustered first, then the 'rare' clusters are \
                    mapped to the 'abundant' clusters. Finally, 'rare' clusters which cannot be mapped are \
                    clustered separately. e.g. If r=5, clusters with size <=5 are considered 'rare' in the above \
                    procedure. r=0 yields the best accuracy; if you believe your data is not too diverse to be \
                    handled, r=0 is the best choice.
    :param blockcount: The number of blocks in the first round of clustering. Hint for choosing b: each block in \
                        the first round should contain about 50 sequences, i.e. b=N/50, where N is the number of \
                        input sequences. Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
""" # Grab the fasta file(s) to cluster inputs = getInputFiles(input_f) debugPrintInputInfo(inputs, "clustered") pool = init_pool(min(len(inputs), processes)) # RUN CLUSTERING # crop -i %s -o %s -z %s -c %s -e %s -m %s%s run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP, [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct, maxmcmc, maxsm, rare, blockcount], {"exists": [input_]}, extraargstring) for input_ in inputs], pool) # CLEAN THE OUTPUT GROUPS FILE printVerbose("Parsing the groups file from clustering") clustered_groups_files = getInputFiles(outdir, "*.cluster.list") debugPrintInputInfo(clustered_groups_files, "converted to groups files") run_parallel([PythonRunner(parseCROPoutToGroups, [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))], {"exists": [input_]}) for input_ in clustered_groups_files], pool) printVerbose("Done parsing groups file.") # Collect the groups file from clustering with counts removed cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False) # Resolve the user specified names file if necessary final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files) # GATHER AUX FILES input_dir = getDirName(input_f) aux_files = cleaned_clustered_groups_files aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False) aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False) aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False) aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False) aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False) aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False) aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False) # Move the final groups file(s) to the groups dir groups_dir = makeDirOrdie("%s_groups_files" % outdir) bulk_move_to_dir(final_groups_files, groups_dir) # Move aux files to the aux dir aux_dir = makeAuxDir(outdir) bulk_move_to_dir(aux_files, aux_dir) # Cleanup the pool cleanup_pool(pool)