Example #1
def run(options):

    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)
    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)

    # Logging
    import logging

    logger = logging.getLogger("pipits_funits")
    logger.setLevel(logging.DEBUG)

    streamLoggerFormatter = logging.Formatter(
        "%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC
    )

    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)

    fileLoggerFormatter = logging.Formatter(
        "%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC
    )
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file
    summary_file = open(options.outDir + "/summary_pipits_funits.txt", "w")

    # Start!
    logger.info(tc.OKBLUE + "PIPITS FUNITS started" + tc.ENDC)

    # Scripts
    EXE_DIR = os.path.dirname(os.path.realpath(__file__))
    PIPITS_SCRIPTS_DIR = EXE_DIR

    # Check integrity of the input file
    logger.info("Checking input FASTA for illegal characters")
    # Dict-like, indexed access to the records (Bio.SeqIO has no "FastaParser")
    record = SeqIO.index(options.input, "fasta")
    for i in record.keys():
        description = record[i].description
        if " " in description:
            logger.error(
                'Error: " " found in the headers. Please remove " " from headers in your FASTA file before proceeding to the next stage.'
            )
            exit(1)

    # For summary 1:
    logger.info("Counting input sequences")
    numberofsequences = 0
    cmd = " ".join(['grep "^>"', options.input, "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    numberofsequences += int(p.communicate()[0])
    p.wait()
    logger.info("\t" + tc.RED + "Number of input sequences: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of input sequences: " + str(numberofsequences) + "\n")

    # Dereplicate
    logger.info("Dereplicating sequences for efficiency")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/dereplicate_fasta.py",
            "-i",
            options.input,
            "-o",
            tmpDir + "/derep.fasta",
            "--cluster",
            tmpDir + "/derep.json",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)
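    # (derep.json keeps the cluster membership so the dereplicated sequences
    # can be re-inflated to the original read counts after ITSx, see below)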

    # For summary 2:
    logger.debug("Counting dereplicated sequences")
    numberofsequences = 0
    cmd = " ".join(['grep "^>"', tmpDir + "/derep.fasta", "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    numberofsequences += int(p.communicate()[0])
    p.wait()
    logger.debug("\t" + tc.RED + "Number of dereplicated sequences: " + str(numberofsequences) + tc.ENDC)

    # Run ITSx. Chop reads into regions. Re-orientate where needed.
    # ITSx always prints something to STDERR and nothing to STDOUT, so STDOUT has to be suppressed in non-verbose mode.
    # Its return code is always 0 no matter what happens...
    # The only way to tell whether it quit with an error is to capture STDERR and look for the phrase "FATAL ERROR" - not implemented.
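    # (Hypothetical sketch, not in the original: such a check could capture
    # STDERR and scan it for that phrase, e.g.
    #   p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
    #   _, err = p.communicate()
    #   if b"FATAL ERROR" in err:
    #       logger.error("ITSx reported a fatal error")
    #       exit(1)
    # )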
    logger.info("Extracting " + options.ITSx_subregion + " from sequences [ITSx]")
    cmd = " ".join(
        [
            pd.ITSx,
            "-i",
            tmpDir + "/derep.fasta",
            "-o",
            tmpDir + "/derep",
            "--preserve",
            "T",
            "-t",
            "F",
            "--cpu",
            options.threads,
            "--save_regions",
            options.ITSx_subregion,
        ]
    )
    rc.run_cmd_ITSx(cmd, logger, options.verbose)

    # Remove short sequences (< 100 bp)
    logger.info("Removing sequences shorter than 100 bp")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/fasta_filter_by_length.py",
            "-i",
            tmpDir + "/derep." + options.ITSx_subregion + ".fasta",
            "-o",
            tmpDir + "/derep." + options.ITSx_subregion + ".sizefiltered.fasta",
            "-l 100",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)

    # Re-inflate
    logger.info("Re-inflating sequences")
    cmd = " ".join(
        [
            "python",
            PIPITS_SCRIPTS_DIR + "/inflate_fasta.py",
            "-i",
            tmpDir + "/derep." + options.ITSx_subregion + ".sizefiltered.fasta",
            "-o",
            options.outDir + "/ITS.fasta",
            "--cluster",
            tmpDir + "/derep.json",
        ]
    )
    rc.run_cmd(cmd, logger, options.verbose)

    # Count number of ITS
    logger.info("Counting sequences after re-inflation")
    numberofsequences = 0

    cmd = " ".join(['grep "^>"', options.outDir + "/ITS.fasta", "|", "wc -l"])
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    numberofsequences = int(p.communicate()[0])
    p.wait()

    logger.info(tc.RED + "\tNumber of sequences with ITS subregion: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of sequences with ITS subregion: " + str(numberofsequences) + "\n")
    if numberofsequences == 0:
        logger.info(tc.RED + "Have you chosen the right subregion? Exiting as no sequences to process." + tc.ENDC)
        exit(1)

    """
    # Concatenating ITS1 and ITS2
    logger.info("Concatenating ITS1 and ITS2 ...")
    cmd = " ".join(["python", PIPITS_SCRIPTS_DIR + "/concatenate_fasta.py", 
                    "-1", options.outDir + "/ITS1.fasta" , 
                    "-2", options.outDir + "/ITS2.fasta",
                    "-o", options.outDir + "/ITS.fasta"])
    rc.run_cmd(cmd, logger, options.verbose)
    logger.info("Concatenating ITS1 and ITS2 " + tc.OKGREEN + "(Done)" + tc.ENDC)
    """

    # Finally move and delete tmp
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.move(tmpDir + "/derep.summary.txt", options.outDir + "/ITSx_summary.txt")
        shutil.rmtree(tmpDir)

    logger.info(
        tc.OKBLUE
        + 'PIPITS FUNITS ended successfully. "'
        + "ITS.fasta"
        + '" created in "'
        + options.outDir
        + '"'
        + tc.ENDC
    )
    logger.info(
        tc.OKYELLOW
        + "Next Step: PIPITS PROCESS [ Suggestion: pipits_process -i "
        + options.outDir
        + "/"
        + "ITS.fasta -o out_process ]"
        + tc.ENDC
    )
    print("")
    summary_file.close()
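
Note: the repeated grep "^>" ... | wc -l shell-outs above depend on a POSIX shell and external tools. A minimal pure-Python sketch of the same count (standard library only; the helper name is ours, not part of PIPITS):

def count_fasta_sequences(path):
    # Count FASTA records by counting header lines that start with ">"
    n = 0
    with open(path) as handle:
        for line in handle:
            if line.startswith(">"):
                n += 1
    return n

# e.g. numberofsequences = count_fasta_sequences(options.input)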
Example #2
def run(options):

    PIPITS_PREP_OUTPUT = "prepped.fasta"


    # Make directories (outdir and tmpdir)
    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)

    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)


    # Logging
    import logging
    logger = logging.getLogger("pipits_prep")
    logger.setLevel(logging.DEBUG)

    streamLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)

    fileLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file
    summary_file = open(options.outDir + "/summary_pipits_prep.txt", "w")

    # Start!
    logger.info(tc.OKBLUE + "PIPITS PREP started" + tc.ENDC)

    EXE_DIR = os.path.dirname(os.path.realpath(__file__))


    # Check for the presence of rawdata directory
    logger.debug("Checking for presence of input directory")
    if not os.path.exists(options.dataDir):
        logger.error("Cannot find \"" + options.dataDir + "\" directory. Ensure you have the correct name of the directory where your Illumina sequences are stored")
        exit(1)


    fastqs_l = []
    fastqs_f = []
    fastqs_r = []

    # if list is provided...
    if options.listfile:
        logger.info("Processing user-provided listfile")
        try:
            listfile = open(options.listfile, "r")
        except IOError:
            logger.error("\"" + options.listfile + "\" not found.")
            exit(1)

        for l in listfile:
            if l.strip(" ").strip("\n") != "" and not l.startswith("#"):
                l = l.rstrip().split("\t")
                fastqs_l.append(l[0])
                fastqs_f.append(l[1])
                fastqs_r.append(l[2])
        listfile.close()


    # if not provided
    if not options.listfile:
        logger.info("Getting list of fastq files and sample ID from input folder")
        fastqs = []
        for file in os.listdir(options.dataDir):
            if file.endswith((".fastq.gz", ".bz2", ".fastq")):
                fastqs.append(file)

        if len(fastqs) % 2 != 0:
            logger.error("There are missing pair(s) in the Illumina sequences. Check your files and labelling")
            exit(1)

        # Assumes the sorted file list alternates forward (R1) and reverse (R2) reads
        coin = True
        for fastq in sorted(fastqs):
            if coin:
                fastqs_f.append(fastq)
            else:
                fastqs_r.append(fastq)
            coin = not coin

        # Forward and reverse files must share the sample ID before the first "_"
        for i in range(len(fastqs_f)):
            if fastqs_f[i].split("_")[0] != fastqs_r[i].split("_")[0]:
                logger.error("Problem with the labelling of the FASTQ files: forward and reverse names do not match.")
                exit(1)
            fastqs_l.append(fastqs_f[i].split("_")[0])


    # Check
    if len(fastqs_f) != len(fastqs_r):
        logger.error("Different number of forward FASTQs and reverse FASTQs")
        exit(1)


    # Done loading. Now check the file extensions.
    filenameextensions = []
    for filename in (fastqs_f + fastqs_r):
        filenameextensions.append(filename.split(".")[-1].rstrip())
    if len(set(filenameextensions)) > 1:
        logger.error("More than two types of extensions")
        exit(1)
    extensionType = next(iter(filenameextensions))


    # For summary 1:
    logger.info("Counting sequences in rawdata")
    numberofsequences = 0
    for fr in fastqs_f:
        if extensionType == "gz":
            cmd = " ".join(["zcat", options.dataDir + "/" + fr, "|", "wc -l"])
        elif extensionType == "bz2":
            cmd = " ".join(["bzcat", options.dataDir + "/" + fr, "|", "wc -l"])
        elif extensionType == "fastq":
            cmd = " ".join(["cat", options.dataDir + "/" + fr, "|", "wc -l"])
        else:
            logger.error("Unknown extension type.")
            exit(1)

        logger.debug(cmd)
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) // 4  # four lines per FASTQ record
        p.wait()
    logger.info("\t" + tc.RED + "Number of paired-end reads in rawdata: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of paired-end reads in rawdata: " + str(numberofsequences) + "\n")


    # Join paired-end reads                                                                                                                                                             
    logger.info("Joining paired-end reads" + "[" + options.joiner_method + "]")
    if not os.path.exists(tmpDir + "/joined"):
        os.mkdir(tmpDir + "/joined")

    for i in range(len(fastqs_l)):

        if extensionType == "gz":
            cmd = " ".join(["gunzip -c", options.dataDir + "/" + fastqs_f[i], ">", tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["gunzip -c", options.dataDir + "/" + fastqs_r[i], ">", tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        elif extensionType == "bz2":
            cmd = " ".join(["bunzip2 -c", options.dataDir + "/" + fastqs_f[i], ">", tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["bunzip2 -c", options.dataDir + "/" + fastqs_r[i], ">", tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        elif extensionType == "fastq":
            cmd = " ".join(["ln -sf", 
                            os.path.abspath(options.dataDir + "/" + fastqs_f[i]), 
                            tmpDir + "/joined/" + fastqs_f[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
            cmd = " ".join(["ln -sf",
                            os.path.abspath(options.dataDir + "/" + fastqs_r[i]),
                            tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)
        else:
            print(extensionType)
            logger.error("Unknown extension found.")
            exit(1)
        
#        joiner_method = "PEAR"

        if options.joiner_method == "PEAR":
            cmd = " ".join([pd.PEAR,
                            "-f", tmpDir + "/joined/" + fastqs_f[i] + ".tmp",
                            "-r", tmpDir + "/joined/" + fastqs_r[i] + ".tmp",
                            "-o", tmpDir + "/joined/" + fastqs_l[i],
                            "-j", options.threads,
                            "-b", options.base_phred_quality_score,
                            "-q 30",
                            "-p 0.0001"])
            rc.run_cmd(cmd, logger, options.verbose)

            cmd = " ".join(["rm -v",
                            tmpDir + "/joined/" + fastqs_f[i] + ".tmp",
                            tmpDir + "/joined/" + fastqs_r[i] + ".tmp"])
            rc.run_cmd(cmd, logger, options.verbose)

            cmd = " ".join(["mv -f", 
                            tmpDir + "/joined/" + fastqs_l[i] + ".assembled.fastq", 
                            tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)

        elif options.joiner_method == "FASTQJOIN":
            cmd = " ".join(["fastq-join",
                            tmpDir + "/joined/" + fastqs_f[i] + ".tmp",
                            tmpDir + "/joined/" + fastqs_r[i] + ".tmp",
                            "-o",
                            tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)

            cmd = " ".join(["mv -f",
                            tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastqjoin",
                            tmpDir + "/joined/"+ fastqs_l[i] +".joined.fastq"])
            rc.run_cmd(cmd, logger, options.verbose)


    # For summary 2:
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["cat", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq", "|", "wc -l"])
        logger.debug(cmd)
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) // 4
        p.wait()
    logger.info("\t" + tc.RED + "Number of joined reads: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of joined reads: " + str(numberofsequences) + "\n")

    # Quality filter
    logger.info("Quality filtering [FASTX]")
    if not os.path.exists(tmpDir + "/fastqqualityfiltered"):
        os.mkdir(tmpDir + "/fastqqualityfiltered")

    for i in range(len(fastqs_f)):
        cmd = " ".join([pd.FASTX_FASTQ_QUALITY_FILTER,
                        "-i", tmpDir + "/joined/" + fastqs_l[i] + ".joined.fastq", 
                        "-o", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", 
                        "-q", options.FASTX_fastq_quality_filter_q,
                        "-p", options.FASTX_fastq_quality_filter_p,
                        "-Q" + options.base_phred_quality_score])
        rc.run_cmd(cmd, logger, options.verbose)


    # For summary 3:
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["cat", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", "|", "wc -l"])
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        numberofsequences += int(p.communicate()[0]) // 4
        p.wait()
    logger.info("\t" + tc.RED + "Number of quality filtered reads: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of quality filtered reads: " + str(numberofsequences) + "\n")


    # Removing reads with \"N\" and FASTA conversion
    if options.FASTX_fastq_to_fasta_n:
        logger.info("Converting FASTQ to FASTA [FASTX]")
    else:
        logger.info("Converting FASTQ to FASTA and also removing reads with \"N\" nucleotide [FASTX]")

    if not os.path.exists(tmpDir + "/fastqtofasta"):
        os.mkdir(tmpDir + "/fastqtofasta")

    fastq_to_fasta_n = ""
    if options.FASTX_fastq_to_fasta_n:
        fastq_to_fasta_n = "-n"

    for i in range(len(fastqs_f)):
        cmd = " ".join([pd.FASTX_FASTQ_TO_FASTA, 
                        "-i", tmpDir + "/fastqqualityfiltered/" + fastqs_l[i] + ".fastq", 
                        "-o", tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta", 
                        "-Q33",
                        fastq_to_fasta_n])
        rc.run_cmd(cmd, logger, options.verbose)


    # For summary 4:
    numberofsequences = 0
    for i in range(len(fastqs_l)):
        cmd = " ".join(["grep \"^>\"", tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta", "|", "wc -l"])
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        numberofsequences += int(p.communicate()[0])
        p.wait()
    logger.info("\t" + tc.RED + "Number of N-less quality filtered sequences: " + str(numberofsequences) + tc.ENDC)
    summary_file.write("Number of N-less quality filtered sequences: " + str(numberofsequences) + "\n")


    # Re-ID and re-index FASTA and merging them all
    logger.info("Re-IDing and indexing FASTA, and merging all into a single file")
    outfileFinalFASTA = open(options.outDir + "/" + PIPITS_PREP_OUTPUT, "w")
    for i in range(len(fastqs_f)):
        line_index = 1
        logger.debug("Reading " + tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta")
        infile_fasta = open(tmpDir + "/fastqtofasta/" + fastqs_l[i] + ".fasta")
        for line in infile_fasta:
            if line.startswith(">"):
                # Re-ID each read as ">sampleID_runningIndex"
                outfileFinalFASTA.write(">" + fastqs_l[i] + "_" + str(line_index) + "\n")
                line_index += 1
            else:
                outfileFinalFASTA.write(line.rstrip() + "\n")
        infile_fasta.close()
    outfileFinalFASTA.close()


    # Clean up tmp_directory
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.rmtree(tmpDir)


    logger.info(tc.OKBLUE + "PIPITS PREP ended successfully. \"" + PIPITS_PREP_OUTPUT + "\" created in \"" + options.outDir + "\"" + tc.ENDC)
    logger.info(tc.OKYELLOW + "Next Step: PIPITS FUNITS [ Suggestion: pipits_funits -i " + options.outDir + "/" + PIPITS_PREP_OUTPUT + " -o out_funits -x YOUR_ITS_SUBREGION ]" + tc.ENDC)
    print("")
    summary_file.close()
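
Note: the three read-counting loops above shell out to zcat/bzcat/cat and divide the line count by four. A compression-aware sketch using only the standard library (gzip/bz2; the helper name is ours, and it assumes well-formed four-line FASTQ records):

import gzip
import bz2

def count_fastq_reads(path):
    # Choose an opener from the extension, mirroring the zcat/bzcat/cat branches
    if path.endswith(".gz"):
        opener = gzip.open
    elif path.endswith(".bz2"):
        opener = bz2.open
    else:
        opener = open
    with opener(path, "rt") as handle:
        return sum(1 for _ in handle) // 4

# e.g. numberofsequences += count_fastq_reads(options.dataDir + "/" + fr)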
Example #3
def run(options):

    # Check file exists
    if not os.path.exists(options.input):
        print("Error: Input file doesn't exist")
        exit(1)


    EXE_DIR = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(options.outDir):
        os.mkdir(options.outDir)
    else:
        shutil.rmtree(options.outDir)
        os.mkdir(options.outDir)
    tmpDir = options.outDir + "/intermediate"
    if not os.path.exists(tmpDir):
        os.mkdir(tmpDir)


    # Logging
    import logging
    logger = logging.getLogger("pipits_process")
    logger.setLevel(logging.DEBUG)

    streamLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)

    streamLogger = logging.StreamHandler()
    if options.verbose:
        streamLogger.setLevel(logging.DEBUG)
    else:
        streamLogger.setLevel(logging.INFO)
    streamLogger.setFormatter(streamLoggerFormatter)
    logger.addHandler(streamLogger)

    fileLoggerFormatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", tc.HEADER + "%Y-%m-%d %H:%M:%S" + tc.ENDC)
    fileLogger = logging.FileHandler(options.outDir + "/log.txt", "w")
    fileLogger.setLevel(logging.DEBUG)
    fileLogger.setFormatter(fileLoggerFormatter)
    logger.addHandler(fileLogger)

    # Summary file
    #summary_file = open(options.outDir + "/summary_pipits_process.txt", "w")

    # Start
    logger.info(tc.OKBLUE + "PIPITS PROCESS started" + tc.ENDC)


    # Check if the file is empty
    if os.stat(options.input).st_size == 0:
        logger.error("Input file is empty!")
        exit(1)
        
    # Dereplicate with VSEARCH
    logger.info("Dereplicating and removing unique sequences prior to picking OTUs")
    cmd = " ".join([pd.VSEARCH, "--derep_fulllength", options.input,
                    "--output", tmpDir + "/input_nr.fasta",
                    "--minuniquesize 2",  # drop singletons (uniques seen only once)
                    "--sizeout",
                    "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)
    #filesize = os.path.getsize(tmpDir + "/input_nr.fasta") / 1000.0
    #logger.info("Dereplicating " + tc.OKGREEN + "(Done) " + tc.ENDC)
    #logger.info("\t" + tc.RED + "File size after initial dereplication: " + str(filesize) + " MB" + tc.ENDC)

    # Check if the file is empty
    if os.stat(tmpDir + "/input_nr.fasta").st_size == 0:
        logger.info(tc.OKYELLOW + "After dereplicating and removing unique sequences, no sequences remain! Processing stopped." + tc.ENDC)
        exit(0)


    # OTU clustering
    logger.info("Picking OTUs [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, 
                    "--cluster_fast", tmpDir + "/input_nr.fasta", 
                    "--id", options.VSEARCH_id,
                    "--centroids", tmpDir + "/input_nr_otus.fasta",
                    "--uc", tmpDir + "/input_nr_otus.uc",
                    "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)


    # Chimera removal
    logger.info("Removing chimeras [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, 
                    "--uchime_ref", tmpDir + "/input_nr_otus.fasta", 
                    "--db", pd.UNITE_REFERENCE_DATA_CHIMERA, 
                    "--nonchimeras", tmpDir + "/input_nr_otus_nonchimeras.fasta",
                    "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)


    # Rename OTUs
    logger.info("Renaming OTUs")
    def renumberOTUS():
        handle_in = open(tmpDir + "/input_nr_otus_nonchimeras.fasta", "r")  # "rU" mode was removed in Python 3.11
        handle_out = open(tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", "w")
        for line in handle_in:
            if line.startswith(">"):
                newlabel = line[1:].split(";")[0]
                handle_out.write(">" + newlabel + "\n")
            else:
                handle_out.write(line.rstrip() + "\n")
        handle_in.close()
        handle_out.close()
    renumberOTUS()


    # Map reads to OTUs
    logger.info("Mapping reads onto centroids [VSEARCH]")
    cmd = " ".join([pd.VSEARCH, 
                    "--usearch_global", options.input, 
                    "--db", tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", 
                    "--id", options.VSEARCH_id, 
                    "--uc", tmpDir + "/otus.uc",
                    "--threads", options.threads])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)


    # OTU construction
    logger.info("Making OTU table")
    cmd = " ".join(["python", EXE_DIR + "/pipits_uc/uc2otutab.py", tmpDir + "/otus.uc", 
                    ">", 
                    tmpDir + "/otu_table_prelim.txt"])
    rc.run_cmd_VSEARCH(cmd, logger, options.verbose)
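    # Note: the ">" redirect above only works if the command runner executes
    # the command through a shell (i.e. shell=True) - an assumption here.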


    # Convert to biom
    logger.info("Converting classic tabular OTU into a BIOM format [BIOM]")
    try:
        os.remove(tmpDir + "/otu_table_prelim.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert", 
                    "-i", tmpDir + "/otu_table_prelim.txt", 
                    "-o", tmpDir + "/otu_table_prelim.biom", 
                    "--table-type=\"OTU table\""])
    rc.run_cmd(cmd, logger, options.verbose)


    # Classifying OTUs
    # http://sourceforge.net/projects/rdp-classifier/files/RDP_Classifier_TrainingData/ 
    logger.info("Assigning taxonomy [RDP Classifier]")
    cmd = " ".join(["java", "-jar", pd.RDP_CLASSIFIER_JAR, "classify", 
                    "-t", pd.UNITE_RETRAINED_DIR + "/rRNAClassifier.properties", 
                    "-o", options.outDir + "/assigned_taxonomy.txt", 
                    tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta"])
    rc.run_cmd(cmd, logger, options.verbose)


    # Reformatting RDP_CLASSIFIER output for biom
    logger.info("Reformatting RDP_Classifier output")
    cmd = " ".join(["python", EXE_DIR + "/reformatAssignedTaxonomy.py", 
                    "-i", options.outDir + "/assigned_taxonomy.txt" , 
                    "-o", options.outDir + "/assigned_taxonomy_reformatted_filtered.txt",
                    "-c", options.RDP_assignment_threshold])
    rc.run_cmd(cmd, logger, options.verbose)


    # Adding RDP_CLASSIFIER output to OTU table
    logger.info("Adding assignment to OTU table [BIOM]")
    try:
        os.remove(options.outDir + "/otu_table.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "add-metadata", 
                    "-i", tmpDir + "/otu_table_prelim.biom", 
                    "-o", options.outDir + "/otu_table.biom", 
                    "--observation-metadata-fp", options.outDir + "/assigned_taxonomy_reformatted_filtered.txt", 
                    "--observation-header", "OTUID,taxonomy,confidence", 
                    "--sc-separated", "taxonomy", 
                    "--float-fields", "confidence"])
    rc.run_cmd(cmd, logger, options.verbose)


    # Convert BIOM to TABLE
    logger.info("Converting OTU table with taxa assignment into a BIOM format [BIOM]")
    try:
        os.remove(options.outDir + "/otu_table.txt")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert", 
                    "-i", options.outDir + "/otu_table.biom", 
                    "-o", options.outDir + "/otu_table.txt", 
                    "--header-key taxonomy",  
                    "-b"])
    rc.run_cmd(cmd, logger, options.verbose)


    # Make phylotype table
    logger.info("Phylotyping OTU table")
    cmd = " ".join(["python", EXE_DIR + "/phylotype_biom.py",
                    "-i", options.outDir + "/otu_table.biom",
                    "-o", options.outDir + "/phylotype_table.txt"])
    rc.run_cmd(cmd, logger, options.verbose)

    try:
        os.remove(options.outDir + "/phylotype_table.biom")
    except OSError:
        pass
    cmd = " ".join([pd.BIOM, "convert",
                    "-i", options.outDir + "/phylotype_table.txt",
                    "-o", options.outDir + "/phylotype_table.biom",
                    "--table-type=\"OTU table\" --process-obs-metadata=\"taxonomy\""])
    rc.run_cmd(cmd, logger, options.verbose)


    # Move representative sequence file to outDir
    shutil.move(tmpDir + "/input_nr_otus_nonchimeras_relabelled.fasta", options.outDir + "/repseqs.fasta")


    # Remove tmp
    if options.remove:
        logger.info("Cleaning temporary directory")
        shutil.rmtree(tmpDir)


    # Final stats

    #############################
    # Import json formatted OTU #
    #############################

    def biomstats(BIOMFILE):
        import json
        with open(BIOMFILE) as jsondata:
            biom = json.load(jsondata)

        sampleSize = int(biom["shape"][1])
        otus = int(biom["shape"][0])

        taxonomies = []
        for i in range(len(biom["rows"])):
            taxonomies.append("; ".join(biom["rows"][i]["metadata"]["taxonomy"]))

        sampleids = []
        for i in range(len(biom["columns"])):
            sampleids.append(biom["columns"][i]["id"])

        import numpy as np

        # BIOM table into matrix
        matrix = np.zeros(shape=(otus, sampleSize))
        for i in biom["data"]:
            matrix[i[0], i[1]] = i[2]
        totalCount = matrix.sum()

        return totalCount, otus, sampleSize

    otu_reads_count, otu_count, otu_sample_count = biomstats(options.outDir + "/otu_table.biom")
    phylo_reads_count, phylo_count, phylo_sample_count = biomstats(options.outDir + "/phylotype_table.biom")

    outfile = open(options.outDir + "/summary_pipits_process.txt", "w")

    outfile.write("No.of reads after singletons and chimera removal: " + str(int(otu_reads_count)) + "\n")
    outfile.write("Number of OTUs:                                   " + str(otu_count) + "\n")
    outfile.write("Number of phylotypes:                             " + str(phylo_count) + "\n")
    outfile.write("Number of samples:                                " + str(otu_sample_count) + "\n")

    logger.info(tc.RED + "\tNumber of reads after singletons and chimera removal: " + str(int(otu_reads_count)) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of OTUs:                                       " + str(otu_count) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of phylotypes:                                 " + str(phylo_count) + tc.ENDC)
    logger.info(tc.RED + "\tNumber of samples:                                    " + str(otu_sample_count) + tc.ENDC)


    # Done!
    logger.info(tc.OKBLUE + "PIPITS_PROCESS ended successfully." + tc.ENDC)
    logger.info(tc.OKYELLOW + "Resulting files are in \"" + options.outDir + "\" directory" + tc.ENDC)
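
Note: biomstats() above parses the BIOM 1.0 (JSON) file by hand. Assuming the biom-format Python package is installed and the file is readable by it, a sketch of the same three numbers (the path and variable names are ours):

from biom import load_table

table = load_table("out_process/otu_table.biom")  # assumed path
otus, samples = table.shape                       # (observations, samples)
total_count = table.sum()                         # grand total of all counts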