def run(options):
    """PIPITS FUNITS stage: extract the chosen ITS subregion from input sequences.

    Pipeline (each step shells out via ``rc.run_cmd`` / ``subprocess``):
      1. (Re)create ``options.outDir`` and an ``intermediate`` tmp directory.
      2. Check FASTA headers for spaces.
      3. Dereplicate input (``dereplicate_fasta.py``).
      4. Extract the subregion with ITSx.
      5. Drop sequences shorter than 100 bp.
      6. Re-inflate to per-read sequences (``inflate_fasta.py``) into ``ITS.fasta``.

    Writes ``log.txt`` and ``summary_pipits_funits.txt`` into ``options.outDir``.
    Exits the process with status 1 when no sequences carry the subregion.

    :param options: parsed CLI options; this function reads ``outDir``, ``input``,
        ``verbose``, ``threads``, ``ITSx_subregion`` and ``remove``
        (presumably an ``argparse.Namespace`` — defined by the caller).
    """
    # Start from a clean output directory: an existing one is wiped wholesale.
    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)

    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)

    # Logging: everything at DEBUG to file, DEBUG/INFO to console depending on -v.
    import logging
    logger = logging.getLogger("pipits_funits")
    logger.setLevel(logging.DEBUG)
    streamLoggerFormatter = logging.Formatter(
        "%(asctime)s %(levelname)s: %(message)s",
        tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC
    )
    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)
    fileLoggerFormatter = logging.Formatter(
        "%(asctime)s %(levelname)s: %(message)s",
        tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC
    )
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file
    summary_file = open(options.outDir + "/summary_pipits_funits.txt", "w")

    # Start!
    logger.info(tc.OKBLUE + "PIPITS FUNITS started" + tc.ENDC)

    # Scripts: helper scripts live next to this file.
    EXE_DIR = os.path.dirname(os.path.realpath(__file__))
    PIPITS_SCRIPTS_DIR = EXE_DIR

    # Check integrity of the input file
    logger.info("Checking input FASTA for illegal characters")
    record = SeqIO.FastaParser(options.input)
    for i in record.keys():
        description = record[i].description
        if description.find(" ") != -1:
            # NOTE(review): only logs the error — there is no exit()/raise here,
            # so processing continues even with spaces in headers; confirm intended.
            logger.error(
                'Error: " " found in the headers. Please remove " " from headers in your FASTA file before proceeding to the next stage.'
            )

    # For summary 1: count input sequences by counting ">" header lines.
    # NOTE(review): shell=True with a string-built command; paths with spaces
    # or shell metacharacters would break this.
    logger.info("Counting input sequences")
    numberofsequences = 0
    cmd = " ".join(['grep "^>"', options.input, "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    numberofsequences += int(p.communicate()[0])
    p.wait()
    logger.info("\t" + tc.RED + "Number of input sequences: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of input sequences: " + str(numberofsequences) + "\n")

    # Dereplicate: collapse duplicate reads so ITSx runs on unique sequences only;
    # derep.json keeps the cluster membership needed for re-inflation later.
    logger.info("Dereplicating sequences for efficiency")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/dereplicate_fasta.py",
            "-i",
            options.input,
            "-o",
            tmpDir + "/derep.fasta",
            "--cluster",
            tmpDir + "/derep.json",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)

    # For summary 2:
    logger.debug("Counting dereplicated sequences")
    numberofsequences = 0
    cmd = " ".join(['grep "^>"', tmpDir + "/derep.fasta", "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    numberofsequences += int(p.communicate()[0])
    p.wait()
    logger.debug("\t" + tc.RED + "Number of dereplicated sequences: " + str(numberofsequences) + tc.ENDC)

    # Run ITSx. Chop reads into regions. Re-orientate where needed
    # ITSx always prints something to STDERR and outputs nothing to STDOUT, so need to supress stdout in non-verbose mode
    # Returncode is always 0 no matter what...
    # No way to tell whether it quits with an error or not other than by capturing STDERR with a phrase "FATAL ERROR" - not implemented
    logger.info("Extracting " + options.ITSx_subregion + " from sequences [ITSx]")
    cmd = " ".join(
        [
            pd.ITSx,
            "-i",
            tmpDir + "/derep.fasta",
            "-o",
            tmpDir + "/derep",
            "--preserve",
            "T",
            "-t",
            "F",
            "--cpu",
            options.threads,
            "--save_regions",
            options.ITSx_subregion,
        ]
    )
    rc.run_cmd_ITSx(cmd, logger, options.verbose)

    # Removing short sequences (<100bp)
    logger.info("Removing sequences below < 100bp")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/fasta_filter_by_length.py",
            "-i",
            tmpDir + "/derep." + options.ITSx_subregion + ".fasta",
            "-o",
            tmpDir + "/derep."
            + options.ITSx_subregion
            + ".sizefiltered.fasta",
            "-l 100",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)

    # Re-inflate: expand the dereplicated survivors back to one record per
    # original read, using the cluster map saved at the dereplication step.
    logger.info("Re-inflating sequences")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/inflate_fasta.py",
            "-i",
            tmpDir + "/derep." + options.ITSx_subregion + ".sizefiltered.fasta",
            "-o",
            options.outDir + "/ITS.fasta",
            "--cluster",
            tmpDir + "/derep.json",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)

    # Count number of ITS
    logger.info("Counting sequences after re-inflation")
    numberofsequences = 0
    cmd = " ".join(['grep "^>"', options.outDir + "/ITS.fasta", "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    numberofsequences = int(p.communicate()[0])
    p.wait()
    if numberofsequences == 0:
        # Nothing carried the requested subregion — abort with a hint.
        logger.info(tc.RED + "\tNumber of sequences with ITS subregion: " + str(numberofsequences) + tc.ENDC)
        logger.info(tc.RED + "Have you chosen the right subregion? Exiting as no sequences to process." + tc.ENDC)
        summary_file.write("Number of sequences with ITS subregion: " + str(numberofsequences) + "\n")
        exit(1)
    else:
        logger.info(tc.RED + "\tNumber of sequences with ITS subregion: " + str(numberofsequences) + tc.ENDC)
        summary_file.write("Number of sequences with ITS subregion: " + str(numberofsequences) + "\n")

    # Disabled step kept for reference (dead code preserved as a string literal).
    """
    # Concatenating ITS1 and ITS2
    logger.info("Concatenating ITS1 and ITS2 ...")
    cmd = " ".join(["python", PIPITS_SCRIPTS_DIR + "/concatenate_fasta.py", "-1", options.outDir + "/ITS1.fasta" , "-2", options.outDir + "/ITS2.fasta", "-o", options.outDir + "/ITS.fasta"])
    rc.run_cmd(cmd, logger, options.verbose)
    logger.info("Concatenating ITS1 and ITS2 " + tc.OKGREEN + "(Done)" + tc.ENDC)
    """

    # Finally move and delete tmp: keep the ITSx summary, drop everything else.
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.move(tmpDir + "/derep.summary.txt", options.outDir + "/ITSx_summary.txt")
        shutil.rmtree(tmpDir)

    logger.info(
        tc.OKBLUE
        + 'PIPITS FUNITS ended successfully. "'
        + "ITS.fasta"
        + '" created in "'
        + options.outDir
        + '"'
        + tc.ENDC
    )
    logger.info(
        tc.OKYELLOW
        + "Next Step: PIPITS PROCESS [ Suggestion: pipits_process -i "
        + options.outDir
        + "/"
        + "ITS.fasta -o out_process ]"
        + tc.ENDC
    )
    print("")
    summary_file.close()
def run(options):
    """PIPITS PREP stage: turn raw paired-end Illumina FASTQs into one merged FASTA.

    Pipeline (each step shells out via ``rc.run_cmd`` / ``subprocess``):
      1. (Re)create ``options.outDir`` and an ``intermediate`` tmp directory.
      2. Collect forward/reverse FASTQ pairs (from a listfile or by scanning
         ``options.dataDir``).
      3. Decompress (gz/bz2) or symlink plain fastq inputs.
      4. Join read pairs with PEAR or fastq-join.
      5. Quality-filter with FASTX, convert FASTQ to FASTA.
      6. Re-ID every sequence as ``<sampleID>_<n>`` and merge into
         ``prepped.fasta`` in ``options.outDir``.

    Writes ``log.txt`` and ``summary_pipits_prep.txt`` into ``options.outDir``.

    :param options: parsed CLI options; reads ``outDir``, ``dataDir``,
        ``listfile``, ``verbose``, ``threads``, ``joiner_method``,
        ``base_phred_quality_score``, ``FASTX_*`` settings and ``remove``
        (presumably an ``argparse.Namespace`` — defined by the caller).
    """
    # Name of the final merged FASTA produced in options.outDir.
    PIPITS_PREP_OUTPUT = "prepped.fasta"

    # Make directories (outdir and tmpdir); an existing outdir is wiped wholesale.
    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)
    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)

    # Logging: everything at DEBUG to file, DEBUG/INFO to console depending on -v.
    import logging
    logger = logging.getLogger("pipits_prep")
    logger.setLevel(logging.DEBUG)
    streamLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)
    fileLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file
    summary_file = open(options.outDir + "/summary_pipits_prep.txt", "w")

    # Start!
    logger.info(tc.OKBLUE + "PIPITS PREP started" + tc.ENDC)

    EXE_DIR = os.path.dirname(os.path.realpath(__file__))

    # Check for the presence of rawdata directory
    logger.debug("Checking for presence of input directory")
    if not os.path.exists(options.dataDir):
        logger.error("Cannot find \"" + options.dataDir + "\" directory. Ensure you have the correct name of the directory where your Illumina sequences are stored")
        exit(1)

    # Parallel lists: sample labels, forward FASTQs, reverse FASTQs (index-aligned).
    fastqs_l = []
    fastqs_f = []
    fastqs_r = []

    # if list is provided... (tab-separated: label, forward file, reverse file;
    # blank lines and '#' comments skipped)
    if options.listfile:
        logger.info("Processing user-provided listfile")
        try:
            listfile = open(options.listfile, "r")
        except IOError:
            logger.error("\"" + options.listfile + "\" not found.")
            exit(1)
        for l in listfile:
            if l.strip(" ").strip("\n") != "" and not l.startswith("#"):
                l = l.rstrip().split("\t")
                fastqs_l.append(l[0])
                fastqs_f.append(l[1])
                fastqs_r.append(l[2])
        listfile.close()

    # if not provided: scan the data directory and pair files by sorted order.
    if not options.listfile:
        logger.info("Getting list of fastq files and sample ID from input folder")
        fastqs = []
        for file in os.listdir(options.dataDir):
            if \
                file.endswith(".fastq.gz") or \
                file.endswith(".bz2") or \
                file.endswith(".fastq"):
                fastqs.append(file)
        if len(fastqs) % 2 != 0:
            logger.error("There are missing pair(s) in the Illumina sequences. Check your files and labelling")
            exit(1)
        # Alternate sorted filenames into forward/reverse lists.
        # NOTE(review): pairing relies purely on lexicographic sort order;
        # the label check below (prefix before first "_") is the only guard.
        coin = True
        for fastq in sorted(fastqs):
            if coin == True:
                fastqs_f.append(fastq)
            else:
                fastqs_r.append(fastq)
            coin = not coin
        for i in range(len(fastqs_f)):
            if fastqs_f[i].split("_")[0] != fastqs_r[i].split("_")[0]:
                logger.error("Problem with labelling FASTQ files.")
                exit(1)
            # Sample ID = filename prefix before the first underscore.
            fastqs_l.append(fastqs_f[i].split("_")[0])

    # Check
    if len(fastqs_f) != len(fastqs_r):
        logger.error("Different number of forward FASTQs and reverse FASTQs")
        exit(1)

    # Done loading. Now check the file extensions.
    filenameextensions = []
    for filename in (fastqs_f + fastqs_r):
        filenameextensions.append(filename.split(".")[-1].rstrip())
    if len(set(filenameextensions)) > 1:
        logger.error("More than two types of extensions")
        exit(1)
    # All extensions are identical at this point, so the first one is representative.
    extensionType = next(iter(filenameextensions))

    # For summary 1: count raw read pairs (FASTQ line count / 4 per forward file).
    # NOTE(review): "/ 4" yields a float total under Python 3, so the logged
    # count prints as e.g. "1000.0" — confirm whether an int is expected.
    logger.info("Counting sequences in rawdata")
    numberofsequences = 0
    for fr in fastqs_f:
        if extensionType == "gz":
            cmd = " ".join(["zcat", options.dataDir + "/" + fr, "|", "wc -l"])
        elif extensionType =="bz2":
            cmd = " ".join(["bzcat", options.dataDir + "/" + fr, "|", "wc -l"])
        elif extensionType =="fastq":
            cmd = " ".join(["cat", options.dataDir + "/" + fr, "|", "wc -l"])
        else:
            logger.error("Unknown extension type.")
            exit(1)
        logger.debug(cmd)
        p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) / 4
        p.wait()
    logger.info("\t" + tc.RED + "Number of paired-end reads in rawdata: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of paired-end reads in rawdata: " + str(numberofsequences) + "\n")

    # Join paired-end reads: decompress (or symlink) each pair into tmpDir/joined,
    # then merge overlapping mates with the selected joiner.
    logger.info("Joining paired-end reads" + "[" + options.joiner_method + "]")
    if not os.path.exists(tmpDir + "/joined"):
        os.mkdir(tmpDir + "/joined")
    for i in range(len(fastqs_l)):
        if extensionType == "gz":
            cmd = " ".join(["gunzip -c", options.dataDir + "/" + fastqs_f[i], ">", tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["gunzip -c", options.dataDir + "/" + fastqs_r[i], ">", tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        elif extensionType == "bz2":
            cmd = " ".join(["bunzip2 -c", options.dataDir + "/" + fastqs_f[i], ">", tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["bunzip2 -c", options.dataDir + "/" + fastqs_r[i], ">", tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        elif extensionType == "fastq":
            # Plain FASTQ needs no decompression — symlink it into place instead.
            cmd = " ".join(["ln -sf",
                            os.path.abspath(options.dataDir + "/" + fastqs_f[i]),
                            tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["ln -sf",
                            os.path.abspath(options.dataDir + "/" + fastqs_r[i]),
                            tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        else:
            print(extensionType)
            logger.error("Unknown extension found.")
            exit(1)

        # joiner_method = "PEAR"
        if options.joiner_method == "PEAR":
            # PEAR writes <label>.assembled.fastq; rename to the common
            # <label>.joined.fastq the downstream steps expect.
            cmd = " ".join([pd.PEAR, "-f", tmpDir + "/joined/" + fastqs_f[i] + ".tmp", "-r", tmpDir + "/joined/" + fastqs_r[i] + ".tmp", "-o", tmpDir + "/joined/" + fastqs_l[i], "-j", options.threads, "-b", options.base_phred_quality_score, "-q 30", "-p 0.0001"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["rm -v", tmpDir + "/joined/" + fastqs_f[i] + ".tmp", tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["mv -f", tmpDir + "/joined/" + fastqs_l[i] + ".assembled.fastq", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)
        elif options.joiner_method == "FASTQJOIN":
            # NOTE(review): unlike the PEAR branch, the ".tmp" inputs are not
            # removed here — confirm whether that is intentional.
            cmd = " ".join(["fastq-join", tmpDir + "/joined/" + fastqs_f[i] + ".tmp", tmpDir + "/joined/" + fastqs_r[i] + ".tmp", "-o", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["mv -f", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastqjoin", tmpDir + "/joined/"+ fastqs_l[i] +".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)

    # For summary 2: count joined reads (FASTQ line count / 4).
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["cat", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq", "|", "wc -l"])
        logger.debug(cmd)
        p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) / 4
        p.wait()
    logger.info("\t" + tc.RED + "Number of joined reads: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of joined reads: " + str(numberofsequences) + "\n")

    # Quality filter
    logger.info("Quality filtering [FASTX]")
    if not os.path.exists(tmpDir + "/fastqqualityfiltered"):
        os.mkdir(tmpDir + "/fastqqualityfiltered")
    for i in range(len(fastqs_f)):
        cmd = " ".join([pd.FASTX_FASTQ_QUALITY_FILTER, "-i", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq", "-o", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", "-q", options.FASTX_fastq_quality_filter_q, "-p", options.FASTX_fastq_quality_filter_p, "-Q" + options.base_phred_quality_score])
        rc.run_cmd(cmd, logger, options.verbose)

    # For summary 3: count quality-filtered reads.
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["cat", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", "|", "wc -l"])
        p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) / 4
        p.wait()
    logger.info("\t" + tc.RED + "Number of quality filtered reads: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of quality filtered reads: " + str(numberofsequences) + "\n")

    # Removing reads with "N" and FASTA conversion.
    # FASTX's "-n" flag means KEEP reads containing N; without it they are dropped.
    if options.FASTX_fastq_to_fasta_n:
        logger.info("Converting FASTQ to FASTA [FASTX]")
    else:
        logger.info("Converting FASTQ to FASTA and also removing reads with \"N\" nucleotide [FASTX]")
    if not os.path.exists(tmpDir + "/fastqtofasta"):
        os.mkdir(tmpDir + "/fastqtofasta")
    fastq_to_fasta_n = ""
    if options.FASTX_fastq_to_fasta_n:
        fastq_to_fasta_n = "-n"
    for i in range(len(fastqs_f)):
        cmd = " ".join([pd.FASTX_FASTQ_TO_FASTA, "-i", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", "-o", tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta", "-Q33", fastq_to_fasta_n])
        rc.run_cmd(cmd, logger, options.verbose)

    # For summary 3: count surviving FASTA records via ">" header lines.
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["grep \"^>\"", tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta", "|", "wc -l"])
        p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE)
        numberofsequences += int(p.communicate()[0])
        p.wait()
    logger.info("\t" + tc.RED + "Number of N-less quality filtered sequences: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of N-less quality filtered sequences: " + str(numberofsequences) + "\n")

    # Re-ID and re-index FASTA and merging them all:
    # headers become ">{sampleID}_{running index}" so reads stay traceable
    # to their sample in the merged file.
    logger.info("Re-IDing and indexing FASTA, and merging all into a single file")
    outfileFinalFASTA = open(options.outDir + "/" + PIPITS_PREP_OUTPUT, "w")
    for i in range(len(fastqs_f)):
        line_index = 1
        logger.debug("Reading " + tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta")
        # NOTE(review): infile_fasta is never closed explicitly; relies on GC.
        infile_fasta = open(tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta")
        for line in infile_fasta:
            if line.startswith(">"):
                outfileFinalFASTA.write(">" + fastqs_l[i] + "_" + str(line_index) + "\n")
                line_index += 1
            else:
                outfileFinalFASTA.write(line.rstrip() + "\n")
    outfileFinalFASTA.close()

    # Clean up tmp_directory
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.rmtree(tmpDir)

    logger.info(tc.OKBLUE + "PIPITS PREP ended successfully. \"" + PIPITS_PREP_OUTPUT + "\" created in \"" + options.outDir + "\"" + tc.ENDC)
    logger.info(tc.OKYELLOW + "Next Step: PIPITS FUNITS [ Suggestion: pipits_funits -i " + options.outDir + "/" + PIPITS_PREP_OUTPUT + " -o out_funits -x YOUR_ITS_SUBREGION ]" + tc.ENDC)
    print("")
    summary_file.close()
def run(options):
    """PIPITS PROCESS stage: cluster ITS sequences into OTUs and assign taxonomy.

    Pipeline (each step shells out via ``rc.run_cmd*``):
      1. Dereplicate input and drop singletons (VSEARCH --derep_fulllength).
      2. Cluster into OTUs (VSEARCH --cluster_fast) and remove chimeras
         (VSEARCH --uchime_ref against the UNITE reference).
      3. Relabel OTU centroids, map all reads back onto them
         (VSEARCH --usearch_global) and build an OTU table.
      4. Assign taxonomy with the RDP Classifier (UNITE-retrained), attach it
         to the BIOM table, and derive a phylotype table.
      5. Write ``repseqs.fasta``, ``otu_table.*``, ``phylotype_table.*`` and
         ``summary_pipits_process.txt`` into ``options.outDir``.

    :param options: parsed CLI options; reads ``input``, ``outDir``, ``verbose``,
        ``threads``, ``VSEARCH_id``, ``RDP_assignment_threshold`` and ``remove``
        (presumably an ``argparse.Namespace`` — defined by the caller).
    """
    # Check file exists
    if not os.path.exists(options.input):
        print("Error: Input file doesn't exist")
        exit(1)

    EXE_DIR = os.path.dirname(os.path.realpath(__file__))

    # Start from a clean output directory: an existing one is wiped wholesale.
    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)
    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)

    # Logging: everything at DEBUG to file, DEBUG/INFO to console depending on -v.
    import logging
    logger = logging.getLogger("pipits_process")
    logger.setLevel(logging.DEBUG)
    streamLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)
    fileLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file (written at the end instead — see biomstats section below)
    #summary_file = open(options.outDir + "/summary_pipits_process.txt", "w")

    # Start
    logger.info(tc.OKBLUE + "PIPITS PROCESS started" + tc.ENDC)

    # Check if the file is empty
    # NOTE(review): exits with status 0 (success) on an empty input — confirm intended.
    if os.stat(options.input).st_size == 0:
        logger.error("Input file is empty!")
        exit(0)

    # Derep with sgtk: collapse identical sequences; --minuniquesize 2 drops
    # singletons, --sizeout keeps abundance annotations in the headers.
    logger.info("Dereplicating and removing unique sequences prior to picking OTUs")
    cmd = " ".join([pd.VSEARCH, "--derep_fulllength", options.input, "--output", tmpDir + "/input_nr.fasta", "--minuniquesize 2", "--sizeout", "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)
    #filesize = os.path.getsize(tmpDir + "/input_nr.fasta") / 1000.0
    #logger.info("Dereplicating " + tc.OKGREEN + "(Done) " + tc.ENDC)
    #logger.info("\t" + tc.RED + "File size after initial dereplication: " + str(filesize) + " MB" + tc.ENDC)

    # Check if the file is empty
    if os.stat(tmpDir + "/input_nr.fasta").st_size == 0:
        logger.info(tc.OKYELLOW + "After dereplicating and removing unique sequences, there aren't no sequences! Processing stopped." + tc.ENDC)
        exit(0)

    # OTU clustering
    logger.info("Picking OTUs [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, "--cluster_fast", tmpDir + "/input_nr.fasta", "--id", options.VSEARCH_id, "--centroids", tmpDir + "/input_nr_otus.fasta", "--uc", tmpDir + "/input_nr_otus.uc", "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)

    # Chimera removal
    logger.info("Removing chimeras [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, "--uchime_ref", tmpDir + "/input_nr_otus.fasta", "--db", pd.UNITE_REFERENCE_DATA_CHIMERA, "--nonchimeras", tmpDir + "/input_nr_otus_nonchimeras.fasta", "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)

    # Rename OTUs
    logger.info("Renaming OTUs")
    def renumberOTUS():
        # Strip VSEARCH's ";size=N" suffix from each header, keeping only the
        # part before the first ";" as the OTU label.
        # NOTE(review): mode "rU" is deprecated and removed in Python 3.11+ —
        # confirm target interpreter version.
        handle_in = open(tmpDir + "/input_nr_otus_nonchimeras.fasta", "rU")
        handle_out = open(tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", "w")
        for line in handle_in:
            if line.startswith(">"):
                newlabel = line[1:].split(";")[0]
                handle_out.write(">" + newlabel + "\n")
            else:
                handle_out.write(line.rstrip() + "\n")
        handle_in.close()
        handle_out.close()
    renumberOTUS()

    # Map reads to OTUs: every original read (pre-dereplication) is assigned
    # to its closest centroid at the same identity threshold.
    logger.info("Mapping reads onto centroids [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, "--usearch_global", options.input, "--db", tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", "--id", options.VSEARCH_id, "--uc", tmpDir + "/otus.uc", "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)

    # OTU construction
    logger.info("Making OTU table")
    cmd = " ".join(["python", EXE_DIR + "/pipits_uc/uc2otutab.py", tmpDir + "/otus.uc", ">", tmpDir + "/otu_table_prelim.txt"])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)

    # Convert to biom (biom convert refuses to overwrite, hence the pre-remove)
    logger.info("Converting classic tabular OTU into a BIOM format [BIOM]")
    try:
        os.remove(tmpDir + "/otu_table_prelim.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert", "-i", tmpDir + "/otu_table_prelim.txt", "-o", tmpDir + "/otu_table_prelim.biom", "--table-type=\"OTU table\""])
    rc.run_cmd(cmd, logger, options.verbose)

    # Classifying OTUs
    # http://sourceforge.net/projects/rdp-classifier/files/RDP_Classifier_TrainingData/
    logger.info("Assigning taxonomy [RDP Classifier]")
    cmd = " ".join(["java", "-jar", pd.RDP_CLASSIFIER_JAR, "classify", "-t", pd.UNITE_RETRAINED_DIR + "/rRNAClassifier.properties", "-o", options.outDir + "/assigned_taxonomy.txt", tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta"])
    rc.run_cmd(cmd, logger, options.verbose)

    # Reformatting RDP_CLASSIFIER output for biom (filter by confidence threshold)
    logger.info("Reformatting RDP_Classifier output")
    cmd = " ".join(["python", EXE_DIR + "/reformatAssignedTaxonomy.py", "-i", options.outDir + "/assigned_taxonomy.txt" , "-o", options.outDir + "/assigned_taxonomy_reformatted_filtered.txt", "-c", options.RDP_assignment_threshold])
    rc.run_cmd(cmd, logger, options.verbose)

    # Adding RDP_CLASSIFIER output to OTU table
    logger.info("Adding assignment to OTU table [BIOM]")
    try:
        os.remove(options.outDir + "/otu_table.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "add-metadata", "-i", tmpDir + "/otu_table_prelim.biom", "-o", options.outDir + "/otu_table.biom", "--observation-metadata-fp", options.outDir + "/assigned_taxonomy_reformatted_filtered.txt", "--observation-header", "OTUID,taxonomy,confidence", "--sc-separated", "taxonomy", "--float-fields", "confidence"])
    rc.run_cmd(cmd, logger, options.verbose)

    # Convert BIOM to TABLE (classic TSV with a taxonomy column)
    logger.info("Converting OTU table with taxa assignment into a BIOM format [BIOM]")
    try:
        os.remove(options.outDir + "/otu_table.txt")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert", "-i", options.outDir + "/otu_table.biom", "-o", options.outDir + "/otu_table.txt", "--header-key taxonomy", "-b"])
    rc.run_cmd(cmd, logger, options.verbose)

    # Make phylotyp table (collapse OTUs sharing the same taxonomy)
    logger.info("Phylotyping OTU table")
    cmd = " ".join(["python", EXE_DIR + "/phylotype_biom.py", "-i", options.outDir + "/otu_table.biom", "-o", options.outDir + "/phylotype_table.txt"])
    rc.run_cmd(cmd, logger, options.verbose)
    try:
        os.remove(options.outDir + "/phylotype_table.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert", "-i", options.outDir + "/phylotype_table.txt", "-o", options.outDir + "/phylotype_table.biom", "--table-type=\"OTU table\" --process-obs-metadata=\"taxonomy\""])
    rc.run_cmd(cmd, logger, options.verbose)

    # Move representative sequence file to outDir
    shutil.move(tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", options.outDir + "/repseqs.fasta")

    # Remove tmp
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.rmtree(tmpDir)

    # Final stats
    #############################
    # Import json formatted OTU #
    #############################
    def biomstats(BIOMFILE):
        """Read a JSON-formatted BIOM file and return (total read count, OTU count, sample count).

        Dense-fills the sparse "data" triples into a numpy matrix to sum total reads.
        """
        import json
        jsondata = open(BIOMFILE)
        biom = json.load(jsondata)
        # BIOM "shape" = [rows (OTUs), columns (samples)].
        sampleSize = int(biom["shape"][1])
        otus = int(biom["shape"][0])
        # NOTE(review): taxonomies and sampleids are built but never used or returned.
        taxonomies = []
        for i in range(len(biom["rows"])):
            taxonomies.append("; ".join(biom["rows"][i]["metadata"]["taxonomy"]))
        sampleids = []
        for i in range(len(biom["columns"])):
            sampleids.append(biom["columns"][i]["id"])
        import numpy as np
        # BIOM table into matrix: each data entry is [row, col, value].
        matrix = np.zeros(shape=(otus, sampleSize))
        for i in biom["data"]:
            matrix[i[0], i[1]] = i[2]
        totalCount = matrix.sum()
        return totalCount, otus, sampleSize

    otu_reads_count, otu_count, otu_sample_count = biomstats(options.outDir + "/otu_table.biom")
    phylo_reads_count, phylo_count, phylo_sample_count = biomstats(options.outDir + "/phylotype_table.biom")

    outfile = open(options.outDir + "/summary_pipits_process.txt", "w")
    outfile.write("No.of reads after singletons and chimera removal: " + str(int(otu_reads_count)) + "\n")
    outfile.write("Number of OTUs: " + str(otu_count) + "\n")
    outfile.write("Number of phylotypes: " + str(phylo_count) + "\n")
    outfile.write("Number of samples: " + str(otu_sample_count) + "\n")
    # NOTE(review): outfile is never closed explicitly; relies on GC/interpreter exit.
    logger.info(tc.RED + "\tNumber of reads after singletons and chimera removal: " + str(int(otu_reads_count)) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of OTUs: " + str(otu_count) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of phylotypes: " + str(phylo_count) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of samples: " + str(otu_sample_count) + tc.ENDC)

    # Done!
    logger.info(tc.OKBLUE + "PIPITS_PROCESS ended successfully." + tc.ENDC)
    logger.info(tc.OKYELLOW + "Resulting files are in \"" + options.outDir + "\" directory" + tc.ENDC)