Example #1
    def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
        """Demuxes using FAST X BARCODE SPLITTER.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # Get input files
        files_to_split = getInputFiles(input_f)
        # Assign the files shard numbers
        file_id = range(len(files_to_split))
        file_id_pairs = zip(files_to_split, file_id)
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                    [input_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % id_],
                                    {"exists": [input_, barcodes]}, extraargstring)
                      for input_, id_ in file_id_pairs], pool)
        printVerbose("Demuxed sequences.")

        # Grab all the auxiliary files
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
    def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality,
                                  min_len, processes, extraargstring):
        """Uses a sliding window to identify and trim away areas of low quality.

        :param input_f: Filepath to input file or folder.
        :param outdir: Filepath to the output directory.
        :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
        :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
        :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after trimming,
                        it's dropped.)
        :param processes: Number of processes to use to clean the input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # "trimomatic":       "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
        # -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"

        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clean")
        pool = init_pool(min(len(inputs), processes))

        printVerbose("Cleaning sequences with Trimmomatic...")
        run_parallel([
            ProgramRunner(
                ProgramRunnerCommands.CLEAN_TRIMMOMATIC, [
                    input_,
                    "%s/%s_cleaned.fastq" %
                    (outdir, strip_ixes(input_)), window_size, quality, min_len
                ], {
                    "exists": [outdir, input_],
                    "positive": [window_size, quality, min_len]
                }, extraargstring) for input_ in inputs
        ], pool)
        printVerbose("Done cleaning sequences.")
        cleanup_pool(pool)
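    # Hedged usage sketch (not part of the original source): assuming this method is called on an
    # already-constructed Chewbacca command object named `cmd`, cleaning a folder of fastq files
    # might look like:
    #
    #   cmd.clean_quality_trimmomatic("demuxed/", "cleaned/", window_size=5, quality=25,
    #                                 min_len=200, processes=4, extraargstring="")
    #
    # Per the command template in the comment above, this would run one
    # "SLIDINGWINDOW:5:25 MINLEN:200" Trimmomatic job per input file.  The directory names and
    # threshold values here are placeholders, not values taken from the project.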
    def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        :param pearthreads: The number of threads per process to use.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(inputs, "assemble")
        run_parallel([ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                      [forwards, reverse, "%s/%s_%s" % ( outdir, name, getFileName(forwards)),
                                        pearthreads],
                                      {"exists": [forwards, reverse], "positive": [pearthreads]},
                                      extraargstring)
                        for forwards, reverse in inputs], pool)

        printVerbose("Done assembling sequences...")
        # Grab all the auxiliary files (everything not containing ".assembled.")
        aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
def parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file,
                                   min_simmilarity, min_coverage):
    """Resolves vsearch matches in a vsearch output file to the taxonomic name taken from BIOCODE.
        Takes in a vsearch output file from usearch__global, parses the result for good matches, and
        writes an output file mapping sequence name to taxa name.

    :param vsearch_outfile: An output file from vsearch's usearch__global program.
    :param taxInfo: A two column tabular file mapping BIOCODE sequence names to taxonomic identifier strings.
    :param output_file: Where to write the resulting file that maps sequence ID to taxonomic name.
    :param min_simmilarity: The minimum similarity for an acceptable vsearch match.
    :param min_coverage: The minimum coverage for an acceptable vsearch match.
    """
    printVerbose("Parsing Vsearch Output")
    min_simm = float(min_simmilarity)
    min_coverage = float(min_coverage)
    biocodeTax = buildTaxaDict(taxInfo)
    printVerbose("Constructed identity dictionary with %d entries from %s." %
                 (len(biocodeTax), taxInfo))
    rslt = []
    with open(output_file, 'w') as out:
        printVerbose("Reading %s as query file..." % vsearch_outfile)
        for line in open(vsearch_outfile, 'r'):
            data = line.split()
            if float(data[2]) > min_simm and float(data[4]) > min_coverage:

                if biocodeTax.has_key(data[1]):
                    printVerbose("Found %s as %s" %
                                 (data[1], biocodeTax[data[1]]))
                    data.append(biocodeTax[data[1]])
                    rslt.append("\t".join(data))
                else:
                    printErrorMissingID(out, data[1])
        out.write("\n".join(rslt))
    printVerbose("Wrote %d identified sequences to %s" %
                 (len(rslt), output_file))
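# Hedged sketch (not part of the original source): with the vsearch userfields layout quoted in the
# comments elsewhere in this file (query+target+id+alnlen+qcov), a single output line is filtered
# like so:
#
#   line = "query_seq_1\tBIOCODE_0457\t98.6\t313\t96.2"
#   data = line.split()
#   # data[2] is %identity and data[4] is %query coverage; both must exceed the thresholds
#   keep = float(data[2]) > 97.0 and float(data[4]) > 85.0   # -> True
#
# The sequence names and numbers are invented for illustration; only the column order follows the
# userfields string, and 97/85 are the example thresholds mentioned later in this file.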
    def demux_by_name(self, input_f, barcodes, outdir, filetype, processes, extraargstring):
        """Demuxes using SeqIO.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        aux_dir = makeAuxDir(outdir)
        # Get input files
        files_to_split = getInputFiles(input_f)
        # Assign the files shard numbers
        file_id = range(len(files_to_split))
        file_id_pairs = zip(files_to_split, file_id)
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([PythonRunner(split_on_name,
                                   [input_, barcodes, outdir, id_, filetype], {"exists": [input_]})
                        for input_, id_ in file_id_pairs], pool)


        # Grab all the auxiliary files
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in run_parallel
        run_parallel([PythonRunner(serialRename,
                                   [input_,
                                    "%s/%s_renamed%s" % (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                                    filetype, clip], {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Example #8
File: arms.py Project: mahdi-b/ARMS
def main(argv):
    parser = argparse.ArgumentParser(description="arms description", epilog="arms long description")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s '+version)
    parser.add_argument("--verbose", default=True, help="increase output verbosity", action="store_true")   
    parser.add_argument('-t', '--threads', type=int, default = 1)
    parser.add_argument('--dryRun', action='store_true', default = False)
    subparsers = parser.add_subparsers(dest='action', help='Available commands')

    # preprocess data
    parser_preprocess = subparsers.add_parser('preprocess')
    parser_preprocess.add_argument('-n', '--name', required=True, help="Run Id")
    parser_preprocess.add_argument('-f', '--input_f', required=True, help="Forward Fastq Reads")
    parser_preprocess.add_argument('-r', '--input_r', required=True, help="Reverse Fastq Reads")
    parser_preprocess.add_argument('-b', '--barcodes', required=True, help="Tab-delimited file of barcodes and their samples")
    parser_preprocess.add_argument('-o', '--outDir', required=True, help="Directory where outputs will be saved")
    parser_preprocess.add_argument('-d', '--db', required=True, help="Db against which the sequences are aligned")

    parser_preprocess.set_defaults(func=preprocessData)
    
    global args
    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(format=FORMAT, level=logging.DEBUG)
    else:
        logging.basicConfig(format=FORMAT, level=logging.ERROR)

    printVerbose.VERBOSE = args.verbose
    printVerbose("Running with %s threads" % args.threads)
    pool = Pool(processes=args.threads)
    logging.debug("Initial ARGS are:")   
    logging.debug(args)
    args.func(args, pool)   
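# Hedged usage sketch (not part of the original source): based on the argparse setup above, a
# preprocess run might be launched as:
#
#   python arms.py -t 4 preprocess -n run1 -f forward.fastq -r reverse.fastq \
#       -b barcodes.txt -o output_dir -d reference.fasta
#
# The file and run names are placeholders; the flags mirror the required arguments of the
# 'preprocess' subparser, and global options such as -t/--threads must precede the subcommand.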
Example #9
def query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string,
                  extraargstring, pool):
    """Runs a VSEARCH alignment on pairs of query/reference sequences.

    :param inputs: A list of pairs of (filepaths to) query_fastas and the reference fastas to compare them to.
    :param outdir: Filepath to the directory where the alignment result should be written.
    :param aln_user_string: An optional string of commandline parameters passed to the VSEARCH program.
    :param simmilarity: The minimum similarity percentage (between reference and query sequences), \
                            as a decimal between 0 and 1, required for a positive match.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    :param pool: A fully initialized multiprocessing.Pool object.
    """
    printVerbose("Aligning against reference sequences...")
    #     # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    run_parallel([
        ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [
            processes, query_fasta, ref_fasta, simmilarity,
            "%s/%s.out" % (outdir, strip_ixes(query_fasta)),
            "%s/%s.alnout" % (outdir, strip_ixes(query_fasta)), aln_user_string
        ], {
            "exists": [query_fasta, ref_fasta],
            "positive": [processes]
        }, extraargstring) for query_fasta, ref_fasta in inputs
    ], pool)
    printVerbose("Done aligning.")
    return
    def annotate_otu_chewbacca(self, input_f, outdir, annotation, processes):
        """Annotates an OTU table.

        :param input_f: Filepath to a file or folder of files to annotate.
        :param annotation: Filepath to a file or a folder of files to use as annotations.
        :param outdir: Filepath to the output directory where annotated files will be written.
        :param processes: The maximum number of processes to use.
        """
        # matrixes = getInputFiles(input_f)
        matricies = getInputFiles(input_f)
        debugPrintInputInfo(matricies, "annotated.")
        annotations = getInputFiles(annotation)

        
        # if all the annotations files are empty, just copy over files.
        if len(annotations) == 0 and len(getInputFiles(annotation, ignore_empty_files=False)) > 0:
            pool = init_pool(min(len(matricies), processes))
            print "**WARNING**: Annotation File is empty.  Skipping annotation and copying old OTU tables to output \
                    directory.\n"
            run_parallel([PythonRunner(copy_file, [matrix, outdir],
                                       {"exists": [matrix]}) for matrix in matricies], pool)
        else:
            pool = init_pool(min(len(matricies) * len(annotations), processes))
            debugPrintInputInfo(annotations, "parsed.")
            inputs = product(matricies, annotations)

            printVerbose("Annotating matrix...")

            
            run_parallel([PythonRunner(annotateOTUtable, [matrix, annotation, "%s/%s.txt" % (outdir, "matrix")],
                                       {"exists": [matrix, annotation]})
                          for matrix, annotation in inputs], pool)
            printVerbose("Done Annotating.")

        cleanup_pool(pool)
    def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
        """Uses a sliding window to identify and trim away areas of low quality.

        :param input_f: Filepath to input file or folder.
        :param outdir: Filepath to the output directory.
        :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
        :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
        :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after trimming,
                        it's dropped.)
        :param processes: Number of processes to use to clean the input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # "trimomatic":       "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
        # -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"

        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clean")
        pool = init_pool(min(len(inputs), processes))

        printVerbose("Cleaning sequences with Trimmomatic...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                                    [input_, "%s/%s_cleaned.fastq" % (outdir, strip_ixes(input_)), window_size, quality,
                                     min_len],
                                    {"exists": [outdir, input_], "positive": [window_size, quality, min_len]},
                                    extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done cleaning sequences.")
        cleanup_pool(pool)
Example #12
    def preclean_bayeshammer(self, input_f, input_r, outdir, processes,
                             bayesthreads, extraargstring):
        """Assembles reads from two (left and right) fastq files/directories.

        :param input_f: File path to file or folder of left reads to clean.
        :param input_r: File path to file or folder of right reads to clean.
        :param outdir: Filepath to output directory.
        :param bayesthreads: The number of threads per process to use.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Collect input files, and validate that they match
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." %
                     len(inputs))
        debugPrintInputInfo(inputs, "preclean/fix.")

        run_parallel([
            ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                          [forwards, reverse, outdir, bayesthreads], {
                              "exists": [forwards, reverse],
                              "positive": [bayesthreads]
                          }, extraargstring) for forwards, reverse in inputs
        ], pool)
        printVerbose("Done cleaning reads.")

        # Grab all the auxiliary files (everything not containing ".assembled.")
        # aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        # bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        # Select output files
        aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
        corrected_dir = "%s/corrected" % outdir
        bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
        aux_files += getInputFiles(outdir,
                                   "*unpaired*",
                                   ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

        # Gather aux files
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Rename output files
        output_files = getInputFiles(outdir, "*", "corrected_*")
        for out_file in output_files:
            move(out_file,
                 "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

        # move the last minute log file
        try:
            move("%s/corrected_corrected.fastq" % outdir,
                 "%s/corrected_corrected.fastq" % aux_dir)
        except:
            pass
        cleanup_pool(pool)
Example #13
    def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                            extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param outdir: Filepath to the output directory.
        :param referencefasta: Filepath to a file or folder of files to use as a reference.
        :param taxinfo:  Filepath to a file containing taxonomic info correlated with the referencefasta.
        :param simmilarity: The % similarity between a query and reference sequence required for positive
                                identification.
        :param coverage: The % coverage of matching regions between a query and reference sequence required for positive
                            identification.
        :param processes: The number of processes to use in the identification process.
        :param extraargstring: Advanced program parameter string.
        """
        # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
        #       --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt

        # expecting a fasta to annotate
        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried for identification.")
        ref_fastas = getInputFiles(referencefasta)
        debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
        tax_info_files = getInputFiles(taxinfo)
        debugPrintInputInfo(tax_info_files, "referenced for taxonomic names.")

        # make sure the number of reference fasta files is the same as the number of tax_info files
        if len(tax_info_files) != len(ref_fastas):
            print "Error: The number of reference fastas and taxonomic mapping files is not the same.  There must be \
                    one taxonomic mapping file for each reference fasta."
            return
        ref_data_pairs = zip(ref_fastas, tax_info_files)
        inputs = [x for x in product(query_fastas, ref_fastas)]
        aln_user_string = ""
        pool = init_pool(min(len(inputs), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
        # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
        #
        # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
        inputs = [x for x in product(query_fastas, ref_data_pairs)]
        debugPrintInputInfo(inputs, "queryied against paired refereces.")
        run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                                   ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                    "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                                   {"exists": [query, ref_fasta, tax_info]})
                      for query, (ref_fasta, tax_info) in inputs], pool)
        printVerbose("\nDone parsing...")

        # Gather and move auxiliary files
        aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
Example #14
    def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db,
                               simmilarity, coverage, processes,
                               extraargstring):
        """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

        :param input_f:  Filepath to a file or folder of files to identify.
        :param outdir: Filepath to the output directory.
        :param ref_fasta: Filepath to the curated fasta file to use as a reference.
        :param ref_db: Filepath to the curated reference database (the gi-to-taxid db queried during parsing) to use
                            as a reference.
        :param simmilarity: Minimum % similarity (decimal between 0 and 1) between query and reference sequences
                            required for positive identification.
        :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference sequences required
                            for positive identification.
        :param processes: The number of processes to use in the identification process.
        :param extraargstring: Advanced program parameter string.
        """
        # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
        aln_user_string = "--userfields query+target+id+alnlen+qcov"
        # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
        # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
        coi_fasta = ref_fasta
        ncbi_db_string = ref_db

        query_fastas = getInputFiles(input_f)
        debugPrintInputInfo(query_fastas, "queried against the DB.")
        inputs = [x for x in product(query_fastas, [coi_fasta])]
        pool = init_pool(min(len(query_fastas), processes))

        # VSEARCH ALIGNMENT
        query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string,
                      extraargstring, pool)

        printVerbose("Parsing output...")
        # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
        # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
        #
        # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity)> parsed_nt.out
        run_parallel([
            PythonRunner(parseVSearchOutputAgainstNCBI, [
                "%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                "%s/%s.tax" %
                (outdir, strip_ixes(query)), simmilarity, coverage
            ], {"exits": [query, ncbi_db_string]}) for query in query_fastas
        ], pool)
        printVerbose("Done processing.")

        # Gather and move auxiliary files
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*.tax",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))

        cleanup_pool(pool)
    def convert_chewbacca(self, input_f, outdir, processes):
        """Converts fastq files to fasta format.

        :param input_f: Filepath to a fastq file or folder of fastq files to convert.
        :param outdir: Filepath to the output directory.
        :param processes: The maximum number of processes to use.
        """
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "convert to fasta.")
        printVerbose("Converting to fasta...")
        pool = init_pool(min(len(inputs), processes))
        run_parallel([PythonRunner(translateFastqToFasta,
                                   [input_, "%s/%s.fasta" % (outdir, getFileName(input_))],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done converting.")
        cleanup_pool(pool)
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file,
                                  min_coverage, min_similarity):
    """Resolves vsearch matches in a vsearch output file to the taxonomic name taken from BOLD.
        Takes in a vsearch output file from usearch__global, parses the result for good matches, and
        writes an output file mapping sequence name to taxa name.

    :param vsearch_out: An output file from vsearch's usearch__global program.
    :param database: The database used as part of the vsearch usearch__global operation.
    :param output_file: Where to write the resulting file that maps sequence ID to taxonomic name.
    :param min_coverage: The minimum coverage for an acceptable vsearch match.
    :param min_similarity: The minimum similarity for an acceptable vsearch match.
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()
    conn = sqlite3.connect(database)
    c = conn.cursor()

    query = "select taxid from gi_taxid where gi=%s"

    def getTaxFromId(taxId,
                     taxonomy=[
                         "species", "genus", 'family', 'order', 'class',
                         'phylum'
                     ]):
        myTaxonomy = dict([(a, "") for a in taxonomy])
        taxId = int(taxId)
        for lin in ncbi.get_lineage(taxId):
            rank = ncbi.get_rank([lin]).values()[0]
            if rank in taxonomy:
                val = ncbi.get_taxid_translator([lin]).values()[0]
                myTaxonomy[rank] = val

        return ":".join([myTaxonomy[x] for x in taxonomy[::-1]])

    with open(output_file, 'w') as out:
        for line in open(vsearch_out, 'r'):
            data = line.split()

            if float(data[4]) > min_coverage or float(data[2]) > min_simm:
                hit = c.execute(query % data[1]).fetchone()
                if hit:
                    taxonomy = getTaxFromId(hit[0])
                    data.append(taxonomy)
                    printVerbose("\t".join(data))
                    out.write("\t".join(data))
                    out.write("\n")
                else:
                    printErrorMissingID(out, data[1])
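# Hedged sketch (not part of the original source): getTaxFromId joins the collected ranks from
# highest to lowest (taxonomy[::-1]) with ':' separators, so each annotated line ends with a string
# of the form:
#
#   "<phylum>:<class>:<order>:<family>:<genus>:<species>"
#
# Ranks missing from the NCBI lineage are left empty, e.g. "Arthropoda:::::" for a record resolved
# only to phylum level.  The phylum shown is illustrative, not taken from the project's data.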
Example #18
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc,
                                    outdir, allowedns, processes,
                                    extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        debugPrintInputInfo(inputs, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")
        temp_file_name_template = "%s/temp_%s"
        debarcoded_file_name_template = "%s/%s_debarcoded"
        # Trim adapters from the left
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR, [
                input_file, temp_file_name_template %
                (outdir, strip_ixes(input_file)), "LEFT", adapters, allowedns
            ], {"exists": [input_file, adapters]}, extraargstring)
            for input_file in inputs
        ], pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Trim the reverse complemented adapters from the right
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR, [
                input_file, debarcoded_file_name_template %
                (outdir, strip_ixes(input_file)[5:]), "RIGHT", adaptersrc,
                allowedns
            ], {"exists": [input_file, adaptersrc]}, extraargstring)
            for input_file in temp_files
        ], pool)
        printVerbose("Done Trimming sequences.")

        # Move temp files
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Example #19
def annotateOTUtable(otu_file,
                     annotation_file,
                     out_file,
                     id_col=0,
                     tax_col=6,
                     clip_count_from_annotations=True):
    """Given the best hits from a cleaned, annotated (with taxonomic names), Vsearch out file, renames each
        sequence ID in OTU table with its taxonomic name in the Vsearch outfile.

    :param otu_file: The otu file to annotate.
    :param annotation_file: Vsearch output file, cleaned by parse.ParseVsearchOutForTaxa.py
    :param out_file: Filepath to write the resulting annotated OTU file.
    :param id_col: The column number (zero-indexed) in the Vsearch file containing sequence fasta IDs.
    :param tax_col: The column number (zero-indexed) in the Vsearch file containing the taxonomic IDs.
    :param clip_count_from_annotations: If True, clip dereplication counts from Vsearch sequence fasta IDs.
        Default: True.
    :return: Filepath to the resulting annotated OTU table.
    """

    # CREATE A DICTIONARY MAPPING SEQUENCE ID TO TAXONOMIC NAMES
    id_to_tax = {}
    for line in open(annotation_file, 'r'):
        data = line.split("\t")
        id = data[id_col].rstrip()
        if clip_count_from_annotations:
            id = clip_count(id, '_')
        tax = data[tax_col].rstrip()
        id_to_tax[id] = tax

    identifiable = id_to_tax.keys()
    printVerbose("Constructed a dictionary of %d identities from %s." %
                 (len(id_to_tax), annotation_file))
    # Parse through the matrix file again to replace any identifiable ids and reformat the file
    printVerbose("Annotating %s with found identities." % otu_file)
    with open(out_file, 'w') as out:
        for line in open(otu_file, 'r'):
            line = line.rstrip()
            current_id = line.split()[id_col]

            if current_id in identifiable:
                tax = id_to_tax[current_id]
                out_line = "%s\t%s\n" % (line, "".join(tax.split()))
                out.write(out_line)
            else:
                out_line = "%s\tUnclassified\n" % line
                out.write(out_line)
    return out_file
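# Hedged sketch (not part of the original source): with the defaults id_col=0 and tax_col=6, an
# annotation line whose first field is "OTU_7" and whose seventh field is a taxonomy string maps
# "OTU_7" to that taxon.  An OTU-table row beginning with "OTU_7" then gets the taxon (with internal
# whitespace removed) appended as a final tab-separated column:
#
#   "OTU_7\t12\t0\t3"  ->  "OTU_7\t12\t0\t3\t<taxonomy string>"
#
# Rows whose IDs are not found in the annotation file get "Unclassified" appended instead.  The IDs
# and counts above are invented for illustration.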
Example #20
def splitK(inputFasta, prefix, nbSeqsPerFile, filetype):
    mySeqs = SeqIO.parse(inputFasta, filetype)
    chunk = 0
    sequences = []

    for mySeq in mySeqs:
        mySeq.seq  = mySeq.seq.ungap(".")
        if len(mySeq.seq) < 200:
            continue
        sequences.append(mySeq)
        if len(sequences) % nbSeqsPerFile == 0:
            SeqIO.write(sequences, open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w'), filetype)
            sequences=[]
            chunk+=1
    if sequences:
        SeqIO.write(sequences, open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w'), filetype)
    printVerbose("Split %s into %d parts." % (inputFasta, (chunk + 1)))
Example #21
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7):
    """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to
        the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH).

    :param input_fna: string.  Filepath to the input fna fasta file.
    :param ref_fna: string. Filepath to the reference fna fasta file.
    :param outdir: string. Filepath to the output directory for the hits file.
    :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB.
    """
    def best_hits_from_vsearch(v_search_output):
        best_hits = {}
        for line in open(v_search_output, 'r'):
            data = line.split("\t")
            query_name = data[0].rstrip()
            if best_hits.has_key(query_name):
                if float(best_hits[query_name][2].rstrip()) < float(
                        data[2].rstrip()):
                    best_hits[query_name] = data
            else:
                best_hits[query_name] = data
        return best_hits

    threads = 1
    pool = init_pool(threads)
    #printVerbose.VERBOSE = True
    print "calling vsearch"
    processes = 1
    aln_user_string = ""
    extraargstring = ""

    printVerbose("Aligning against reference sequences...")
    #     # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [
        processes, input_fna, ref_fna, id_pct,
        "%s/%s.out" % (outdir, strip_ixes(input_fna)),
        "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string
    ], {
        "exists": [input_fna, ref_fna],
        "positive": [processes]
    }, extraargstring).run()

    vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna))

    # Choose the best hit
    return best_hits_from_vsearch(vsearch_output)
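# Hedged usage sketch (not part of the original source): a call with placeholder paths might look
# like:
#
#   hits = get_best_hits_from_vsearch("queries.fna", "reference.fna", "vsearch_out", id_pct=0.9)
#
# where `hits` maps each query name to the split vsearch output row of its highest-%identity match
# (the third field, data[2], is compared when a query has multiple hits).  The file paths and the
# 0.9 identity threshold are invented for illustration.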
Example #22
    def visualize_otu_heatmap(self, data_frame, output_file):
        """Visualizes an OTU table as a heatmap, showing OTU abundance in each Sample.

        :param data_frame: A pandas dataframe to graph.
        :param output_file: Filepath to the output graphics file.
        """
        ncols = len(data_frame.columns.values)
        nrows = len(data_frame.index.values)
        printVerbose("Computing dataframe values...")
        fig, ax = plt.subplots(figsize=(10 + ncols / .5, 7 + nrows / 10.0))
        heatmap = ax.pcolor(data_frame, cmap=plt.cm.binary)
        ax.set_xticks(np.arange(ncols) + 0.5)
        ax.set_yticks(np.arange(nrows) + 0.5)
        ax.set_xticklabels(data_frame.columns.values, rotation=90, fontsize=4)
        ax.set_yticklabels(data_frame.index.values, fontsize=4)
        cbar = plt.colorbar(heatmap)
        printVerbose("Saving image %s..." % output_file)
        plt.savefig(output_file, dpi=200)
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file, min_coverage, min_similarity):
    """Resolves vsearch matches in a vsearch output file to the taxonomic name taken from BOLD.
        Takes in a vsearch output file from usearch__global, parses the result for good matches, and
        writes an output file mapping sequence name to taxa name.

    :param vsearch_out: An output file from vsearch's usearch__global program.
    :param database: The database used as part of the vsearch usearch__global operation.
    :param output_file: Where to write the resulting file that maps sequence ID to taxonomic name.
    :param min_coverage: The minimum coverage for an acceptable vsearch match.
    :param min_similarity: The minimum similarity for an acceptable vsearch match.
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()
    conn = sqlite3.connect(database)
    c = conn.cursor()

    query = "select taxid from gi_taxid where gi=%s"

    def getTaxFromId(taxId, taxonomy=["species", "genus", 'family', 'order', 'class', 'phylum']):
        myTaxonomy = dict([(a, "") for a in taxonomy])
        taxId = int(taxId)
        for lin in ncbi.get_lineage(taxId):
            rank = ncbi.get_rank([lin]).values()[0]
            if rank in taxonomy:
                val = ncbi.get_taxid_translator([lin]).values()[0]
                myTaxonomy[rank] = val

        return ":".join([myTaxonomy[x] for x in taxonomy[::-1]])

    with open(output_file, 'w') as out:
        for line in open(vsearch_out, 'r'):
            data = line.split()

            if float(data[4]) > min_coverage or float(data[2]) > min_simm:
                hit = c.execute(query % data[1]).fetchone()
                if hit:
                    taxonomy = getTaxFromId(hit[0])
                    data.append(taxonomy)
                    printVerbose("\t".join(data))
                    out.write("\t".join(data))
                    out.write("\n")
                else:
                    printErrorMissingID(out, data[1])
Example #25
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7):
    """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to
        the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH).

    :param input_fna: string.  Filepath to the input fna fasta file.
    :param ref_fna: string. Filepath to the reference fna fasta file.
    :param outdir: string. Filepath to the output directory for the hits file.
    :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB.
    """
    def best_hits_from_vsearch(v_search_output):
        best_hits = {}
        for line in open(v_search_output, 'r'):
            data = line.split("\t")
            query_name = data[0].rstrip()
            if best_hits.has_key(query_name):
                if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()):
                    best_hits[query_name] = data
            else:
                best_hits[query_name] = data
        return best_hits

    threads = 1
    pool = init_pool(threads)
    #printVerbose.VERBOSE = True
    print "calling vsearch"
    processes=1
    aln_user_string=""
    extraargstring=""


    printVerbose("Aligning against reference sequences...")
    #     # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                            [processes, input_fna, ref_fna, id_pct, "%s/%s.out" % (outdir, strip_ixes(input_fna)),
                             "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string],
                            {"exists": [input_fna, ref_fna], "positive": [processes]},
                            extraargstring).run()

    vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna))

    # Choose the best hit
    return best_hits_from_vsearch(vsearch_output)
def annotateOTUtable(otu_file, annotation_file, out_file, id_col=0, tax_col=6, clip_count_from_annotations=True):
    """Given the best hits from a cleaned, annotated (with taxonomic names), Vsearch out file, renames each
        sequence ID in OTU table with its taxonomic name in the Vsearch outfile.

    :param otu_file: The otu file to annotate.
    :param annotation_file: Vsearch output file, cleaned by parse.ParseVsearchOutForTaxa.py
    :param out_file: Filepath to write the resulting annotated OTU file.
    :param id_col: The column number (zero-indexed) in the Vsearch file containing sequence fasta IDs.
    :param tax_col: The column number (zero-indexed) in the Vsearch file containing the taxonomic IDs.
    :param clip_count_from_annotations: If True, clip dereplication counts from Vsearch sequence fasta IDs.
        Default: True.
    :return: Filepath to the resulting annotated OTU table.
    """

    

    # CREATE A DICTIONARY MAPPING SEQUENCE ID TO TAXONOMIC NAMES
    id_to_tax = {}
    for line in open(annotation_file, 'r'):
        data = line.split("\t")
        id = data[id_col].rstrip()
        if clip_count_from_annotations:
            id = clip_count(id,'_')
        tax = data[tax_col].rstrip()
        id_to_tax[id] = tax
    
    identifiable = id_to_tax.keys()
    printVerbose("Constructed a dictionary of %d identities from %s." % (len(id_to_tax), annotation_file))
    # Parse through the matrix file again to replace any identifiable ids and reformat the file
    printVerbose("Annotating %s with found identities." % otu_file)
    with open(out_file, 'w') as out:
        for line in open(otu_file, 'r'):
            line = line.rstrip()
            current_id = line.split()[id_col]
            
            if  current_id in identifiable:
                tax = id_to_tax[current_id]
                out_line = "%s\t%s\n" % (line, "".join(tax.split()))
                out.write(out_line)
            else:
                out_line = "%s\tUnclassified\n" % line
                out.write(out_line)
    return out_file
    def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
        """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
            characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
            --allowedns flags to specify the maximum number of 'N's to allow

        :param input_f: Filepath to input file or folder.
        :param adapters: Filepath to a list of adapters.
        :param adaptersrc: Filepath to a list of reverse-complemented adapters.
        :param outdir: Filepath to the output directory.
        :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        debugPrintInputInfo(inputs, "trim adapters from")
        # "flexbar":  "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
        printVerbose("Trimming barcodes and adapters with flexbar")
        temp_file_name_template = "%s/temp_%s"
        debarcoded_file_name_template = "%s/%s_debarcoded"
        # Trim adapters from the left
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [input_file, temp_file_name_template % (outdir, strip_ixes(input_file)),
                                     "LEFT", adapters, allowedns],
                                    {"exists": [input_file, adapters]}, extraargstring)
                      for input_file in inputs], pool)

        temp_files = getInputFiles(outdir, "temp_*")
        debugPrintInputInfo(temp_files, "trim adapters from")

        # Trim the reverse complemented adapters from the right
        run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                    [input_file, debarcoded_file_name_template % (outdir, strip_ixes(input_file)[5:]),
                                     "RIGHT", adaptersrc, allowedns],
                                    {"exists": [input_file, adaptersrc]}, extraargstring)
                      for input_file in temp_files], pool)
        printVerbose("Done Trimming sequences.")

        # Move temp files
        aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
Example #28
    def annotate_otu_chewbacca(self, input_f, outdir, annotation, processes):
        """Annotates an OTU table.

        :param input_f: Filepath to a file or folder of files to annotate.
        :param annotation: Filepath to a file or a folder of files to use as annotations.
        :param outdir: Filepath to the output directory where annotated files will be written.
        :param processes: The maximum number of processes to use.
        """
        # matrixes = getInputFiles(input_f)
        matricies = getInputFiles(input_f)
        debugPrintInputInfo(matricies, "annotated.")
        annotations = getInputFiles(annotation)

        # if all the annotations files are empty, just copy over files.
        if len(annotations) == 0 and len(
                getInputFiles(annotation, ignore_empty_files=False)) > 0:
            pool = init_pool(min(len(matricies), processes))
            print "**WARNING**: Annotation File is empty.  Skipping annotation and copying old OTU tables to output \
                    directory.\n"

            run_parallel([
                PythonRunner(copy_file, [matrix, outdir], {"exists": [matrix]})
                for matrix in matricies
            ], pool)
        else:
            pool = init_pool(min(len(matricies) * len(annotations), processes))
            debugPrintInputInfo(annotations, "parsed.")
            inputs = product(matricies, annotations)

            printVerbose("Annotating matrix...")

            run_parallel([
                PythonRunner(
                    annotateOTUtable,
                    [matrix, annotation,
                     "%s/%s.txt" %
                     (outdir, "matrix")], {"exists": [matrix, annotation]})
                for matrix, annotation in inputs
            ], pool)
            printVerbose("Done Annotating.")

        cleanup_pool(pool)
    def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
        """Ungaps a character using Bio python.

            :param input_f: Filepath to input file or folder to ungap.
            :param outdir: Filepath to the output directory where ungapped files should be written.
            :param gapchars: A string containing the gap characters to remove.
            :param file_ext: Either 'fasta' or 'fastq'.
            :param processes: The number of processes to use to ungap the input fileset.
        """
        inputs = getInputFiles(input_f, "*.fasta")
        debugPrintInputInfo(inputs, "ungap.")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Removing all '%s' from sequences..." % gapchars)
        # ungap(file_to_clean, output_file_name, gap_char, file_type):
        run_parallel([PythonRunner(remove_gap_chars,
                                   [input_, "%s/%s_cleaned.%s" % (outdir, strip_ixes(input_), 'fasta'),
                                   gapchars, file_ext],
                                   {"exists": [input_]}) for input_ in inputs], pool)
        printVerbose("Done removing.")
        cleanup_pool(pool)
    def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
        """Partition a fasta/fastq file into chunks of user-defined size.

        :param input_f: Filepath to a file or folder of files to partition.
        :param outdir: The directory to write split files to.
        :param processes: The number of processes to use to partition the input fileset.
        :param chunksize: The number of sequences per file.
        :param filetype: Either 'fasta' or 'fastq'.
        """
        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "partitioned")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Partitioning Files...")
        run_parallel([PythonRunner(splitK,
                                   [input_, "%s/%s" % (outdir, strip_ixes(input_)), chunksize, filetype],
                                   {"exists": [input_]})
                  for input_ in inputs], pool)
        printVerbose("Done partitioning files.")
        cleanup_pool(pool)
Example #31
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir):

    def best_hits_from_vsearch(v_search_output):
        best_hits = {}
        for line in open(v_search_output, 'r'):
            data = line.split("\t")
            query_name = data[0].rstrip()
            if best_hits.has_key(query_name):
                if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()):
                    best_hits[query_name] = data
            else:
                best_hits[query_name] = data
        return best_hits


    threads = 1
    pool = init_pool(threads)
    #printVerbose.VERBOSE = True
    print "calling vsearch"
    # Search for good hits
    inputs = [(input_fna, ref_fna)]
    processes=1
    aln_user_string=""
    extraargstring=""


    printVerbose("Aligning against reference sequences...")
    #     # vsearch --usearch_global %s seeds.pick.fasta  --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                            [processes, input_fna, ref_fna, "%s/%s.out" % (outdir, strip_ixes(input_fna)),
                             "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string],
                            {"exists": [input_fna, ref_fna], "positive": [processes]},
                            extraargstring).run()


    print "cleaning up."
    vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna))

    # Choose the best hit
    return best_hits_from_vsearch(vsearch_output)
def split_on_name(input_f, barcodes_file, outdir, id_, filetype):
    """Demuxes a single input file into separate files.  Matches unique strings in sequence names to the unique sample
        names in the barcodes_file.

    :param input_f: Filepath to the fasta file to demux.
    :param barcodes_file: Filepath to the barcodes file to parse.  Note: only the sample names are read. Barcode data
                            can be faked.
    :param outdir: Filepath to the output directory where demuxed fasta files should be written.
    :param id_: A unique integer id for a fasta file to demux.
    :param filetype: Either 'fasta' or 'fastq' indicating the input and output filetypes.
    """

    unmatched_name = "unmatched"
    sample_names = parse_barcodes_to_dict(barcodes_file).keys()
    sample_names.append(unmatched_name)
    sample_names.sort(reverse=True)
    printVerbose("Possible samples:")
    printVerbose(str(sample_names))
    out_streams = {}
    for sample_name in sample_names:
        outfile =  "%s/%s_%s_splitOut.fastq" % ( outdir, sample_name, id_)
        out_streams[sample_name] = BufferedSeqWriter(outfile, filetype)

    SeqIO.parse(input_f, filetype)
    seq_dict = SeqIO.index(input_f, filetype)

    for name in seq_dict.keys():
        matched = False
        for sample_name in sample_names:
            if sample_name in name:
                out_streams[sample_name].write(seq_dict[name])
                matched = True
                break
        if not matched:
            out_streams[unmatched_name].write(seq_dict[name])

    for writer in out_streams.keys():
        out_streams[writer].flush()
        # Remove empty files
        if out_streams[writer].empty:
            out_streams[writer].delete()
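
# A minimal sketch of the same routing idea without the chewbacca helpers: send each
# record to the first sample whose name occurs in the record id, or to "unmatched"
# otherwise.  The paths and sample names below are hypothetical; requires Biopython.
from Bio import SeqIO

def demux_by_name(seq_path, sample_names, filetype="fasta"):
    bins = {name: [] for name in list(sample_names) + ["unmatched"]}
    for record in SeqIO.parse(seq_path, filetype):
        target = next((s for s in sample_names if s in record.id), "unmatched")
        bins[target].append(record)
    for name, records in bins.items():
        # only write non-empty bins, mirroring the empty-file cleanup above
        if records:
            SeqIO.write(records, "%s_splitOut.%s" % (name, filetype), filetype)
# e.g. demux_by_name("run1.fasta", ["BALI_113", "BALI_114"])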
def parseGroupsFileToDict(groups_file, thing_to_map):
    """Given a .groups file, returns a dictionary mapping each seed to either a count of its children, or a
        space-delimited string of its children's names.

    :param groups_file: A .groups file.
    :param thing_to_map: Specify 'children' to map seed names to a space-delimited string of children names, or 'counts'
                            to map seed names to a count of children.
    :return: A dictionary mapping each seed to either a count of its children, or a space-delimited string of its
            children's names
    """
    groups = {}
    printVerbose("Reading count file: %s" % groups_file)
    # collect the seed names, and the children sequence names
    nb_lines = 0
    for line in open(groups_file, 'r'):
        nb_lines += 1
        data = line.rstrip().split("\t")
        seed = data[0]
        children = ""
        if thing_to_map == "children":
            if len(data) > 1:
                children = ' '.join(list(set(data[1:])))
            groups[seed] = children
        if thing_to_map == "counts":
            if len(data) > 1:
                children = data[1]
            groups[seed] = len(children.split(" "))

        if nb_lines % 100000 == 0:
            printVerbose("%s lines processed" % nb_lines)
    printVerbose("Done reading count file.")
    return groups
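
# A tiny worked illustration of the two modes above, using an in-memory line in the
# usual .groups layout (a seed name, a tab, then a space-delimited child list);
# the names are made up:
line = "seed_1\tchild_a child_b child_c"
seed, children = line.rstrip().split("\t")
as_children = ' '.join(set(children.split(" ")))   # "child_a child_b child_c" (order may vary)
as_count = len(children.split(" "))                # 3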
Example #35
0
def handle_groups_file_update(outdir, groupsfile,
                              clustering_groups_files_uncount):
    """Checks if the user specified groups file exists, and updates the groupings with clustering data.
        Returns a list of the most up to date groups files.
    :param outdir: Filepath to the directory where outputfiles will be written.
    :param groupsfile: Optional filepath to the .groups file, or a folder of .groups files to use as a reference.
    :param clustering_groups_files_uncount: The output groups file from clustering, with trailing replication counts
        removed from sequence names.  Names in this file should match those used in the user-specified groups file
        groupsfile.
    :return: A list of filenames pointing to the most up to date groups files.
    """
    most_recent_groups_files = clustering_groups_files_uncount
    if groupsfile:
        # Try to grab groups files
        user_specified_groups_files = getInputFiles(groupsfile, critical=False)
        # If we have files at the given location
        if len(user_specified_groups_files) != 0:
            most_recent_groups_files = user_specified_groups_files
            printVerbose("Updating .groups files with clustering data")
            debugPrintInputInfo(most_recent_groups_files,
                                "used as groups references")
            update_groups(most_recent_groups_files,
                          clustering_groups_files_uncount, outdir,
                          "postcluster")
            printVerbose("Done updating .groups files.")
            most_recent_groups_files = getInputFiles(outdir,
                                                     "postcluster*.groups")
    else:
        printVerbose("No name files provided, assuming singletons...\n")
    return most_recent_groups_files
Example #38
0
    def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
        """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
            <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

        :param input_f: Filepath to an input file or folder to rename.
        :param outdir: Filepath to the output directory.
        :param filetype: Either 'fasta' or 'fastq'.
        :param clip: If True, remove dereplication counts from sequence names before renaming.
        :param processes: The maximum number of processes to use.
        """

        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "rename")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Renaming sequences...")
        # Run serialRename in run_parallel
        run_parallel([
            PythonRunner(serialRename, [
                input_,
                "%s/%s_renamed%s" %
                (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                filetype, clip
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done renaming sequences...")

        samples_dir = makeDirOrdie("%s_samples" % outdir)
        samples_files = getInputFiles(outdir,
                                      "*.samples",
                                      ignore_empty_files=False)
        bulk_move_to_dir(samples_files, samples_dir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir,
                                  "*.mapping",
                                  ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)

        cleanup_pool(pool)
Example #39
0
    def partition_chewbacca(self, input_f, outdir, processes, chunksize,
                            filetype):
        """Partition a fasta/fastq file into chunks of user-defined size.

        :param input_f: Filepath to a file or folder of files to partition.
        :param outdir: The directory to write split files to.
        :param processes: The number of processes to use to partition the input fileset.
        :param chunksize: The number of sequences per file.
        :param filetype: Either 'fasta' or 'fastq'.
        """
        # Gather input files
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "partitioned")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Partitioning Files...")
        run_parallel([
            PythonRunner(splitK, [
                input_,
                "%s/%s" % (outdir, strip_ixes(input_)), chunksize, filetype
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done partitioning files.")
        cleanup_pool(pool)
    def build_otu_chewbacca(self, outdir, groups_file, samples_file,
                            barcodes_file):
        """Builds the unannotated OTU table using custom chewbacca script.

        :param outdir: The directory where the matrix should be written.
        :param groups_file: A .groups file containing the OTU names and their constituent/replicant sequences.
        :param samples_file: A .samples file containing the samples that each sequence in the .groups file belongs to.
        :param barcodes_file: A .barcodes file listing all sample names.
        """
        groups = getInputFiles(groups_file)
        debugPrintInputInfo(groups, "read.")

        samples = getInputFiles(samples_file)
        debugPrintInputInfo(samples, "read.")

        barcodes = getInputFiles(barcodes_file)
        debugPrintInputInfo(barcodes, "read.")

        printVerbose("Building matrix...")
        buildOTUtable(groups, samples, barcodes[0],
                      "%s/%s.txt" % (outdir, "matrix"))
        printVerbose("Done building.")
Example #41
0
    def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
        """Ungaps a character using Bio python.

            :param input_f: Filepath to input file or folder to ungap.
            :param outdir: Filepath to the output directory where ungapped files should be written.
            :param gapchars: A string containing the gap characters to remove.
            :param file_ext: Either 'fasta' or 'fastq'.
            :param processes: The number of threads to use to ungap the input fileset.
        """
        inputs = getInputFiles(input_f, "*.fasta")
        debugPrintInputInfo(inputs, "ungap.")
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Removing all '%s' from sequences..." % gapchars)
        # ungap(file_to_clean, output_file_name, gap_char, file_type):
        run_parallel([
            PythonRunner(remove_gap_chars, [
                input_,
                "%s/%s_cleaned.%s" %
                (outdir, strip_ixes(input_), 'fasta'), gapchars, file_ext
            ], {"exists": [input_]}) for input_ in inputs
        ], pool)
        printVerbose("Done removing.")
        cleanup_pool(pool)
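
# A standalone sketch of the ungap step, assuming Biopython is available; the gap
# characters and file names below are illustrative, not the pipeline's defaults.
from Bio import SeqIO
from Bio.Seq import Seq

def ungap_fasta(in_path, out_path, gapchars=".-"):
    cleaned = []
    for record in SeqIO.parse(in_path, "fasta"):
        seq = str(record.seq)
        for gap in gapchars:
            seq = seq.replace(gap, "")
        record.seq = Seq(seq)
        cleaned.append(record)
    SeqIO.write(cleaned, out_path, "fasta")
# e.g. ungap_fasta("aligned.fasta", "aligned_cleaned.fasta", gapchars=".-")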
Example #42
0
def splitK(inputFasta, prefix, nbSeqsPerFile, filetype):
    mySeqs = SeqIO.parse(inputFasta, filetype)
    chunk = 0
    sequences = []

    for mySeq in mySeqs:
        mySeq.seq = mySeq.seq.ungap(".")
        if len(mySeq.seq) < 200:
            continue
        sequences.append(mySeq)
        if len(sequences) % nbSeqsPerFile == 0:
            SeqIO.write(
                sequences,
                open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w'),
                filetype)
            sequences = []
            chunk += 1
    if sequences:
        SeqIO.write(
            sequences,
            open("%s_part_%d.%s" % (str(prefix), chunk, filetype), 'w'),
            filetype)
    printVerbose("Split %s into %d parts." % (inputFasta, (chunk + 1)))
    def assemble_pear(self, input_f, input_r, outdir, name, processes,
                      pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        :param pearthreads: The number of threads per process to use.
        """
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\tAssembling reads with pear")
        debugPrintInputInfo(inputs, "assemble")
        run_parallel([
            ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR, [
                forwards, reverse,
                "%s/%s_%s" % (outdir, name, getFileName(forwards)), pearthreads
            ], {
                "exists": [forwards, reverse],
                "positive": [pearthreads]
            }, extraargstring) for forwards, reverse in inputs
        ], pool)

        printVerbose("Done assembling sequences...")
        # Grab all the auxiliary files (everything not containing ".assembled.")
        aux_files = getInputFiles(outdir,
                                  "*",
                                  "*.assembled.*",
                                  ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)
    def align_macse(self, input_f, db, outdir, processes, extraargstring):
        """Aligns sequences by iteratively adding them to a known good alignment.

        :param input_f: Filepath to an input file or folder to align.
        :param db: Filepath to a reference file or folder of reference files for alignment.
        :param outdir: Filepath to the output directory.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "macse_align":      "java -jar " + programPaths["MACSE"] + " -prog enrichAlignment  -seq \"%s\" -align \
        #                                    \"%s\" -seq_lr \"%s\" -maxFS_inSeq 0  -maxSTOP_inSeq 0  -maxINS_inSeq 0 \
        #                                    -maxDEL_inSeq 3 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT \"%s\"_NT \
        #                                    -out_AA \"%s\"_AA -seqToAdd_logFile \"%s\"_log.csv",

        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("Aligning reads using MACSE")
        run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_ALIGN,
                                    [db, db, input_] + ["%s/%s" % (outdir, getFileName(input_))] * 3,
                                    {"exists": [input_, db]}, extraargstring)
                      for input_ in inputs], pool)
        printVerbose("Done with MACSE alignment.")
        cleanup_pool(pool)
Example #45
0
def renameWithReplicantCounts(input_fasta, groups_file, output_fasta,
                              filetype):
    """Covnerts a fasta and a groups file to a sorted, dereplicated, fasta named by abundance.
    Specifically, each seed in the groups file has the number of sequences it represents (including itself) appended as a
    suffix.

     e.g.

    The groups file entry:
          BALI_113_ID1  BALI_113_ID2 BALI_113_ID3

    would be named as
         >BALI_113_ID1_3  in the fasta file to show that it represents 3 sequences (itself, and two other sequences)

    :param input_fasta: Input fasta/fastq file with entries for all items in the groups file.
    :param groups_file:  Input groups file showing clustering/grouping.
    :param output_fasta: Output file path.
    :param filetype: Either 'fasta' or 'fastq'
    :return: Filepath to the output fasta.
    """

    seeds = []
    seedSizes = parseGroupsFileToDictOfCounts(groups_file)

    printVerbose("Indexing reads")
    reads = SeqIO.index(input_fasta, filetype)
    printVerbose("Done indexing reads")

    printVerbose("Renaming sequences")
    for name, count in sorted(seedSizes.items(),
                              key=operator.itemgetter(1),
                              reverse=True):
        s = reads[name]
        s.id = "%s_%s" % (name, count)
        s.description = ""
        seeds.append(s)
        # write in chunks
        if len(seeds) == 500000:
            SeqIO.write(seeds, open(output_fasta, 'a'), filetype)
            seeds = []
    # write the rest of the chunk
    SeqIO.write(seeds, open(output_fasta, 'a'), filetype)
    printVerbose("Done renaming sequences")
    return output_fasta
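
# A small illustration of the abundance-sorted renaming step on made-up counts
# (the seed names below are illustrative):
import operator
seed_sizes = {"BALI_113_ID1": 3, "BALI_207_ID9": 12, "BALI_044_ID2": 1}
ordered = sorted(seed_sizes.items(), key=operator.itemgetter(1), reverse=True)
new_ids = ["%s_%s" % (name, count) for name, count in ordered]
# new_ids == ['BALI_207_ID9_12', 'BALI_113_ID1_3', 'BALI_044_ID2_1']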
Example #49
0
    def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
        """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
            (the samplesDir argument).

        :param input_f: File path to file or folder of files to clean.
        :param ref: Filepath to the reference file used to align the input files.
        :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
        :param outdir: Filepath to the directory to write outputs to.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """
        # "macse_format":     "java -jar " + programPaths["MACSE"] + "  -prog exportAlignment -align \"%s\" \
        #                           -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""

        inputs = getInputFiles(input_f)
        pool = init_pool(min(len(inputs), processes))
        printVerbose("\t %s Processing MACSE alignments")
        samples_list = getInputFiles(samplesdir)
        run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                    ["%s/%s_NT" % (input_f, getFileName(sample)),
                                     "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                     "%s/%s_macse.csv" % (outdir, getFileName(sample))],

                                    {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]}, extraargstring)
                      for sample in samples_list], pool)
        printVerbose("\tCleaning MACSE alignments")

        printVerbose("Processing %s samples..." % len(samples_list))
        nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]

        # Clean the alignments
        from classes.PythonRunner import PythonRunner
        run_parallel([PythonRunner(remove_refs_from_macse_out, [input_, ref,
                                   "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                                   {"exists": [input_, ref]})
                      for input_ in nt_macse_outs], pool)

        # Cat the cleaned alignments
        cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
        merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)

        aux_dir = makeAuxDir(outdir)
        aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
        bulk_move_to_dir(aux_files, aux_dir)
        cleanup_pool(pool)
Example #50
0
    def merge_chewbacca(self, input_f, outdir, output_filename,
                        output_fileext):
        """Merges files together into a new output file.

        :param input_f: Filepath to a directory of input files.
        :param outdir: Filepath to the output folder.
        :param output_filename: The filename of the output file, without an extension.
        :param output_fileext: The file extension of the output file.
        """
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "merged")
        printVerbose("Merging files.")
        output_file = "%s/%s_MERGED.%s" % (outdir, output_filename,
                                           output_fileext)
        merge_files(inputs, output_file)
        printVerbose("Done merging.")
        printVerbose("Merged %d files into %s" % (len(inputs), output_file))
Example #51
0
    def visualize_otu_sample_comp(self, data_frame, output_file):
        """Creates a stacked barchart showing the OTU composition in each sample.

        :param data_frame: A pandas dataframe to graph.
        :param output_file: Filepath to the output graphics file.
        """
        ncols = len(data_frame.columns.values)
        nrows = len(data_frame.index.values)
        printVerbose("Computing dataframe values...")
        sums = data_frame.sum(0).values
        data_frame = data_frame.divide(sums)
        printVerbose("Transposing dataframe...")
        data_frame.transpose().plot(kind='bar',
                                    stacked=True,
                                    ylim=(0, 1),
                                    figsize=(10 + ncols / .5,
                                             7 + nrows / 10.0),
                                    colormap=plt.cm.hsv)
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=5)
        printVerbose("Saving image %s..." % output_file)
        plt.savefig(output_file, dpi=200)
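
# A minimal sketch of the input this method expects: a DataFrame with one row per
# OTU and one column per sample, holding raw counts.  The names and numbers are
# made up; requires pandas (and matplotlib for the plot itself).
import pandas as pd

counts = pd.DataFrame({"sample_A": [10, 0, 5], "sample_B": [2, 8, 0]},
                      index=["OTU_1", "OTU_2", "OTU_3"])
# Dividing each column by its sum, as the method does, turns counts into per-sample
# proportions, so each stacked bar adds up to 1.0.
proportions = counts.divide(counts.sum(0))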
Example #52
0
def update_groups(old_groups_files, new_groups_files, out_dir, out_prefix):
    """Updates an old_groups file with the results of a new_groups file, and writes the results to a new groups file.
    E.g. Given the old_groups file lists:
    old_groups:
    1   2 3
    4   5 6
    and the new_groups file lists
    new_groups:
    1   4
    Then return an out_groups file listing
    out_groups
    1   2 3 4 5 6

    Finer points:
    1. The list of child sequences following the seed should not contain the seed.
    2. The size of the cluster represented by the seed is the number of children succeeding the seed,
            plus one for the seed.
    :param new_groups_files: The current iteration of the groups file.
    :param old_groups_files: The previous iteration of the groups file.
    :param out_dir: The resulting updated groups file.
    :param out_prefix: The prefix for the output filename.


    :return: Filepath to the updated groups file
    """
    if not (len(old_groups_files) and len(new_groups_files)):
        print "\n***WARNING***: Received empty file lists.  Aborting group file update.  If you provided a groups \
        file, something went wrong.\n"

        return
    printVerbose("Using %s and %s to generate updated groups file %s_updated.groups" % \
                     (old_groups_files[0], new_groups_files[0], out_prefix))
    old_groups_temp_file = "%s/temp_old_merged.groups" % out_dir
    new_groups_temp_file = "%s/temp_new_merged.groups" % out_dir
    output_file = "%s/%s_updated.groups" % (out_dir, out_prefix)

    # Concat the old and new groups files respectively
    merge_files(old_groups_files, old_groups_temp_file)
    merge_files(new_groups_files, new_groups_temp_file)

    # parse the groups files to dictionaries of children
    old_seeds = parseGroupsFileToDictOfChilden(old_groups_temp_file)
    new_seeds = parseGroupsFileToDictOfChilden(new_groups_temp_file)
    new_keys = new_seeds.keys()
    total = len(new_keys)
    i = 0
    outstring = ""
    with open(output_file, 'w') as output:
        for new_seed in new_keys:
            i += 1
            if i % 10000 == 0:
                printVerbose("Processed %d / %d lines\n" % (i, total))
                output.write(outstring)
                outstring = ""
            my_old_children = []
            children_of_my_new_children = []
            # Back in my day, I used to be a seed!
            if new_seed in old_seeds:
                my_old_children = old_seeds[new_seed].split(" ")
            my_new_children = new_seeds[new_seed].split(" ")
            for entry in my_new_children:
                if entry in old_seeds:
                    children_of_my_new_children += old_seeds[entry].split(" ")
            # list(set(my_old_children + my_new_children + children_of_my_new_children))
            all_my_children = my_old_children + my_new_children + children_of_my_new_children

            outstring += "%s\t%s\n" % (new_seed, " ".join(
                set(all_my_children)))
            # print "Seed %s has %d children" % (new_seed, len(all_my_children))
            # print("%s_%d = %d old  +  %d new  +   %d children of new" %
            #           (new_seed, len(all_my_children), len(my_old_children), len(my_new_children),
            #            len(children_of_my_new_children)))
        output.write(outstring)
    os.remove(old_groups_temp_file)
    os.remove(new_groups_temp_file)
    return output_file
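
# A compact illustration of the merge rule from the docstring above, using plain
# dicts in place of the parsed .groups files (values are space-delimited child lists;
# the data is made up):
old_seeds = {"1": "2 3", "4": "5 6"}
new_seeds = {"1": "4"}
merged = {}
for seed, new_children in new_seeds.items():
    children = set(old_seeds.get(seed, "").split())
    for child in new_children.split():
        children.add(child)
        children.update(old_seeds.get(child, "").split())
    children.discard("")
    merged[seed] = " ".join(sorted(children))
# merged == {"1": "2 3 4 5 6"}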
Example #53
0
def buildOTUtable(latest_groups_files, inital_samples_files, barcodes_file, out_file):
    """Given a single barcodes file with all possible \
    sample names, a list of the latest groups file(s), and a list of initial samples files \
    (mapping each original, undereplicated sequence to its sample name), builds an OTU \
    table and writes it to out_file.

    :param latest_groups_files:  A list of the latest groups files.  No sequence name may occur in more than one \
                                    groups file.
    :param inital_samples_files: A list of the initial samples files.  This should map each sequence to its parent sample.
    :param barcodes_file:       A single barcodes file listing all valid sample names.
    :param out_file:            Filepath to the output file.
    """
    print "latest_groups_files: %s " % latest_groups_files
    print "inital_samples_files: %s " % inital_samples_files
    print "barcodes_file: %s " % barcodes_file
    
    
    seq_to_sample = {}
    # read the initial groups/samples file (from rename)
    # make a single dict from all the groups/samples files mapping seqname to group
    all_sample_names = set()
    for samples_file in inital_samples_files:
        printVerbose("Reading samples file: %s" % samples_file)

        with open(samples_file, 'r') as current_samples_file:
            for line in current_samples_file:
                name, sample = line.split()
                sample_name = sample.rstrip()
                seq_to_sample[name] = sample_name
                all_sample_names.add(sample_name)
    all_sample_names = sorted(all_sample_names)

    printVerbose("Found the following sample names:")
    printVerbose(str(all_sample_names))
    #sys.exit(0)
    
    with open(out_file, 'w') as out:
        header_line = "OTU"
        for sample in all_sample_names:
            header_line += "\t%s" % sample
        out.write(header_line + "\n")
        # GENERATE A DICTIONARY MAPPING SEQUENCE NAMES TO THE SAMPLE THEY CAME FROM
        # for each line in the latest groups files,
        for groups_file in latest_groups_files:
            with open(groups_file, 'r') as current_groups_file:
                otu = ""
                children = ""
                # read the latest groups file
                for line in current_groups_file:
                    data = line.split("\t")

                    # TODO: if line is empty... need to find the reason for this
                    if not data:
                        continue
                    # found a cluster
                    if len(data) == 2:
                        otu = data[0].rstrip()
                        children = data[1].rstrip()

                    # found a singleton
                    elif len(data) == 1:
                        otu = data[0].rstrip()

                    # found a blank line
                    else:
                        pass

                    # GENERATE OTU ABUNDANCE BY SAMPLE
                    # initialize a count_dict with each sample as a key and a value of 0
                    sample_counts = {}
                    for sample_name in all_sample_names:
                        sample_counts[sample_name] = 0
                    # for each item in the child list:
                    for child in children.split():
                        # my_sample = lookup that item in the dict to get its sample name
                        my_sample = seq_to_sample[child]
                        # increment the abundance in that sample
                        sample_counts[my_sample] += 1


                    # WRITE THE COUNTS TO THE OUT FILE
                    # for each sample in the barcodes list, write otu to a txt file as a single line
                    out_line = otu
                    
                    for sample_name in all_sample_names:
                        out_line += "\t%s" % sample_counts[sample_name]
                    out.write(out_line + "\n")
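
# For reference, the matrix written above is plain tab-delimited text: a header line
# of sample names after the "OTU" column, then one line per OTU with its per-sample
# abundances.  It can be loaded back for the visualisation steps with pandas (the
# path below is hypothetical and pandas is assumed to be installed):
import pandas as pd
otu_table = pd.read_csv("matrix.txt", sep="\t", index_col="OTU")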
Example #54
0
File: arms.py Project: mahdi-b/ARMS
def preprocessData(args, pool=Pool(processes=1)):
   # TODO: test run.name is a single word

   makeDirOrdie(args.outDir)

   printVerbose("Preprocessing the data:")

   # *****************************************************************************************
   printVerbose("\t Renaming sequences")
   # "~/programs/fastx/bin/fastx_renamer -n COUNT -i %s %s"
   rename_outFile_f = os.path.join("outDir/", os.path.basename(args.input_f)+"_renamed")
   rename_outFile_r = os.path.join("outDir/", os.path.basename(args.input_r)+"_renamed")

   pool.map(runInstance, [ProgramRunner("fastx_renamer",[args.input_f, rename_outFile_f], {"exists":[args.input_f]}),
                          ProgramRunner("fastx_renamer",[args.input_r, rename_outFile_r], {"exists":[args.input_r]}),
                          ])
   printVerbose("\tRenamed X sequences")
   # *****************************************************************************************
   # Making the contigs using Pear
   # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s"
   assembledPrefix = os.path.join("outDir", args.name)
   pool.map(runInstance, [ProgramRunner("pear", 
                                           (rename_outFile_f, rename_outFile_r, assembledPrefix, args.threads), 
                                           {"exists":[rename_outFile_f, rename_outFile_r]}) 
                          ])
   assembledFastqFile = os.path.join("outDir", args.name+".assembled.fastq")
   # add py char to a web-page
   printVerbose("\t %s sequences assembled, %s contigs discarded, %s sequences discarded" % (1,1,1))


   # *****************************************************************************************
   # Converting fastq to fasta file (do with mothur or BioSeqIO to keep prog deps to a minimum)
   pool.map(runInstance, [ProgramRunner("fastq.info", 
                                           [assembledFastqFile], 
                                           {"exists": [assembledFastqFile]}) 
                          ])
   assembledFastaFile = os.path.splitext(assembledFastqFile)[0]+".fasta"
   # TODO: add py char to a web-page
   printVerbose("\t converted fastq to fasta")
   # *****************************************************************************************
   # Trimming and assigning reads to groups
   # trim.seqs(fasta=%, oligos=%s, maxambig=0, maxhomop=8, minlength=300, maxlength=550, bdiffs=1, pdiffs=2)

   pool.map(runInstance, [ProgramRunner("trim.seqs",
                                        [assembledFastaFile, args.barcodes],
                                        {"exists": [assembledFastaFile]})
                          ])
   printVerbose("\t %s sequences were assigned to groups and %s sequences were discareded")
   trimmedFasaFile = os.path.splitext(assembledFastqFile)[0]+".trim.fasta"
   # *****************************************************************************************
   # Aligning against the BIOCODETEMPLATE database
   pool.map(runInstance, [ProgramRunner("align.seqs",
                                        [trimmedFasaFile, args.db],
                                        {"exists": [trimmedFasaFile]})
                          ])
   printVerbose("\t %s sequences were assigned to groups and %s sequences were discareded")
    def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                     blockcount, extraargstring):
        """Clusters sequences using CROP.

        :param input_f: Filepath to the input fasta file to cluster.
        :param outdir: Filepath to the output directory.
        :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
        :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                            first round).  For data sets with different average sequence lengths, this parameter should \
                            be tuned such that it won't take too long for each block to do pairwise alignment.  Hint \
                            for choosing z: z*L<150,000, where L is the average length of the sequences.
        :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
        :param maxmcmc: This parameter specifies the number of iterations of MCMC. Default value is 2000. Increase \
                            this value to enhance accuracy (recommended value is at least 10*block size).
        :param maxsm: This parameter specifies the maximum number of 'split and merge' processes to run.  Max is 20.
        :param rare: The maximum cluster size allowed to be classified as 'rare'. Clusters are defined as either \
                            'abundant' or 'rare'. 'Abundant' clusters will be clustered first, then the 'rare' \
                            clusters are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be \
                            mapped will be clustered separately. e.g. If r=5, the clusters with size <=5 will be \
                            considered 'rare' in the above procedure; r=0 will yield the best accuracy. If you \
                            believe your data is not too diverse to be handled, then r=0 will be the best choice.
        :param blockcount: The size of blocks in the first round of clustering. Hint of choosing -b: Each block in the \
                            first round should contain about 50 sequences.  i.e. b=N/50, where N is the number of \
                            input sequences.  Default: # input sequences / z.
        :param processes: The maximum number of processes to use.
        :param extraargstring: Advanced program parameter string.
        """

        # Grab the fasta file(s) to cluster
        inputs = getInputFiles(input_f)
        debugPrintInputInfo(inputs, "clustered")
        pool = init_pool(min(len(inputs), processes))

        # RUN CLUSTERING
        # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
        run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                    [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                                        maxmcmc, maxsm, rare, blockcount],
                                    {"exists": [input_]}, extraargstring) for input_ in inputs], pool)

        # CLEAN THE OUTPUT GROUPS FILE
        printVerbose("Parsing the groups file from clustering")
        clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
        debugPrintInputInfo(clustered_groups_files, "converted to groups files")
        run_parallel([PythonRunner(parseCROPoutToGroups, [input_,
                                   "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                                   {"exists": [input_]})
                      for input_ in clustered_groups_files], pool)
        printVerbose("Done parsing groups file.")

        # Collect the groups file from clustering with counts removed
        cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

        # Resolve the user specified names file if necessary
        final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

        # GATHER AUX FILES
        input_dir = getDirName(input_f)
        aux_files = cleaned_clustered_groups_files
        aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
        aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
        aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
        aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

        # Move the final groups file(s) to the groups dir
        groups_dir = makeDirOrdie("%s_groups_files" % outdir)
        bulk_move_to_dir(final_groups_files, groups_dir)

        # Move aux files to the aux dir
        aux_dir = makeAuxDir(outdir)
        bulk_move_to_dir(aux_files, aux_dir)

        # Cleanup the pool
        cleanup_pool(pool)
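
# A small sketch of the parameter hints from the CROP docstring above: choose a block
# size z such that z * L stays under 150,000 (L = average sequence length), and a
# first-round block count b of roughly N / 50 (N = number of input sequences).  The
# helper and its example numbers are illustrative only.
def suggest_crop_params(n_seqs, mean_len):
    blocksize = max(1, 150000 // max(1, mean_len))
    blockcount = max(1, n_seqs // 50)
    return blocksize, blockcount
# e.g. suggest_crop_params(100000, 400) -> (375, 2000)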