Exemple #1
0
def main():
    """Assemble every sample's reads with ABySS and link the resulting contigs.

    Parses the command-line options, creates the output directory and its
    ``contigs`` sub-directory when they are missing, and then, for each
    (sample, read directory) pair returned by ``get_input_data()``:

    * runs the paired-end wrapper when both r1 and r2 are present, otherwise
      the single-end wrapper,
    * optionally cleans the assembly folder (``args.clean``),
    * converts the contigs file and creates the symlinks.

    Raises:
        EnvironmentError: if ``abyss-pe`` or ``ABYSS`` cannot be located.
        IOError: if a sample has no read 1 data, so no assembly can be run.
        OSError: from ``os.makedirs`` if a sample's directory already exists.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory, and the symlink directory within it,
    # if they do not exist
    contig_dir = os.path.join(args.output, 'contigs')
    for directory in (args.output, contig_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)
    # Only catch the failures a lookup can produce (helper error or an empty
    # result); a bare except would also swallow KeyboardInterrupt/SystemExit.
    try:
        abyss_pe = which('abyss-pe')[0]
        abyss_se = which('ABYSS')[0]
    except (EnvironmentError, IndexError):
        raise EnvironmentError("Cannot find abyss-pe or ABYSS.  Ensure they "
                               "are installed and in your $PATH")
    # run abyss in (mostly) single-threaded mode for RAM and simplicity
    # reasons.  abyss-map will run using as many cores as user specifies.
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        # Without read 1 data neither assembler can run.  Fail loudly here
        # rather than falling through with `output` unbound (or, worse, still
        # pointing at the previous sample's assembly).
        if not reads.r1:
            raise IOError(
                "No read 1 data found for {}; cannot run an assembly.".format(sample)
            )
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        if reads.r2:
            # paired-end data
            output = run_abyss_pe(abyss_pe, args.kmer, reads, args.cores,
                                  sample_dir, log)
            if args.clean:
                cleanup_abyss_assembly_folder(output, log)
        else:
            # single-end data
            output = run_abyss_se(abyss_se, args.kmer, reads,
                                  sample_dir, log)
            if args.clean:
                cleanup_abyss_assembly_folder(output, log, single_end=True)
        contigs_file = get_contigs_file_from_output(output)
        # remove degenerate bases, contigs < 100 bp, and rename
        # contigs to velvet-style naming
        contigs_file = convert_abyss_contigs_to_velvet(contigs_file)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
Exemple #2
0
def main():
    """Assemble every sample's paired-end reads with velvet and link the contigs.

    Parses the command-line options, creates the output directory and its
    ``contigs`` sub-directory when they are missing, and then, for each
    (sample, read directory) pair returned by ``get_input_data()``, runs
    ``velveth`` followed by ``velvetg``, optionally cleans the assembly
    folder (``args.clean``) and creates the symlinks.

    Only paired-end input (both r1 and r2) is assembled; no single-end code
    path exists in this function.

    Raises:
        EnvironmentError: if ``velveth`` or ``velvetg`` cannot be located.
        IOError: if a sample does not provide both r1 and r2 reads.
        OSError: from ``os.makedirs`` if a sample's directory already exists.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory, and the symlink directory within it,
    # if they do not exist
    contig_dir = os.path.join(args.output, 'contigs')
    for directory in (args.output, contig_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)
    # Only catch the failures a lookup can produce (helper error or an empty
    # result); a bare except would also swallow KeyboardInterrupt/SystemExit.
    try:
        velveth = which('velveth')[0]
        velvetg = which('velvetg')[0]
    except (EnvironmentError, IndexError):
        raise EnvironmentError("Cannot find velveth or velvetg.  Ensure they "
                               "are installed and in your $PATH")
    # run velvet in single-threaded mode for RAM and simplicity
    # reasons.
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        # Anything other than paired-end data has no assembly path here.
        # Stop with a clear message instead of continuing with `output`
        # unbound or still pointing at the previous sample's assembly.
        if not (reads.r1 and reads.r2):
            raise IOError(
                "Sample {} does not have both r1 and r2 reads; only "
                "paired-end velvet assembly is implemented.".format(sample)
            )
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # run the assembly for PE data
        output = run_velveth(velveth, args.kmer, reads, sample_dir, log)
        output = run_velvetg(velvetg, args.kmer, output, log)
        if args.clean:
            cleanup_velvet_assembly_folder(output, log)
        contigs_file = get_contigs_file_from_output(output)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(sample_dir, contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
Exemple #3
0
def get_velvet_optimiser(name='VelvetOptimiser'):
    """Return the path to the VelvetOptimiser executable.

    First looks up velvetg, velveth and VelvetOpt with ``which()``; the
    results are discarded, so these calls only matter if ``which()`` fails
    for a missing program (assumed to raise EnvironmentError -- TODO confirm
    against the ``which`` helper).

    Then looks up ``name`` and, if that lookup raises EnvironmentError,
    retries with a ``.pl`` suffix.

    Args:
        name: executable name to look for (default ``'VelvetOptimiser'``).

    Returns:
        The first entry of whatever ``which()`` returns for the executable.

    Raises:
        EnvironmentError: propagated from ``which()`` when the ``.pl``
            fallback lookup fails as well.
    """
    # ensure velvetg, velveth and VelvetOpt are in $PATH
    for required in ("velvetg", "velveth", "VelvetOpt"):
        which(required)
    # we need velvetoptimiser - ensure that is in $PATH and return
    try:
        return which("{}".format(name))[0]
    except EnvironmentError:
        # fall back to the script name with its Perl extension
        return which("{}.pl".format(name))[0]
Exemple #4
0
def main():
    """Assemble every sample's reads with Trinity and link the contigs.

    Parses the command-line options, creates the output directory and its
    ``contigs`` sub-directory when they are missing, and then, for each
    (sample, read directory) pair returned by ``get_input_data()``:

    * copies the read data into the sample directory,
    * combines the read data first when r1, r2 and singleton reads are all
      present,
    * runs the paired-end wrapper when r2 is present, otherwise the
      single-end wrapper,
    * optionally cleans the assembly folder (``args.clean``),
    * creates the symlinks.

    Raises:
        EnvironmentError: if neither ``trinity`` nor ``Trinity.pl`` can be
            located.
        OSError: from ``os.makedirs`` if a sample's directory already exists.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    samples = get_input_data(args.config, args.dir)
    # create the output directory, and the symlink directory within it,
    # if they do not exist
    contig_dir = os.path.join(args.output, 'contigs')
    for directory in (args.output, contig_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)
    # Get path to trinity.  Standard name is `Trinity.pl`.
    # I usually symlink to `trinity`
    # TODO:  Change this to system "which" - this is just too flaky in certain cases
    # Try each name in turn.  The lookups are kept in separate try blocks so
    # that a failure of the fallback name also produces the friendly message
    # below (an exception raised inside an `except` handler is not caught by
    # a sibling handler of the same `try`).
    for candidate in ('trinity', 'Trinity.pl'):
        try:
            trinity = which(candidate)[0]
        except (EnvironmentError, IndexError):
            continue
        break
    else:
        raise EnvironmentError("Cannot find Trinity.  Ensure it is installed and in your $PATH")
    for sample, read_dir in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(read_dir, args.subfolder, log)
        # NOTE(review): when r1 is missing nothing is assembled, yet the
        # symlink step below still runs -- confirm that is intended.
        if reads.r1:
            # every assembly path starts by copying the read data over
            copy_read_data(reads, sample_dir, log)
            if reads.r2:
                # PE data: singleton reads, when present, are combined first
                if reads.singleton:
                    combine_read_data(reads, log)
                output = run_trinity_pe(trinity, reads, args.cores, args.min_kmer_coverage, log)
            else:
                # here, we don't have PE data, so run the assembly for SE data
                output = run_trinity_se(trinity, reads, args.cores, args.min_kmer_coverage, log)
            if args.clean:
                cleanup_trinity_assembly_folder(output, log)
        # generate symlinks to assembled contigs
        generate_symlinks(contig_dir, sample, reads, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def _clean_and_tag_bam(log, sample, assembly_pth, bam, read_type):
    """Run the Picard clean-up and read-group steps on one BAM; return the result."""
    bam = picard_clean_up_bam(log, sample, assembly_pth, bam, read_type)
    return picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam, read_type)


def main():
    """Align each sample's reads to its assembly and trim contigs by coverage.

    For every (sample, reads) pair returned by ``get_input_data()``:

    * checks that ``<args.assemblies>/<sample>/contigs.fasta`` exists,
    * optionally cleans the assembly folder (``args.clean``),
    * builds the bwa index, faidx and reference dictionary,
    * aligns paired-end reads (r1 + r2) and single-end reads (singletons,
      or r1 alone when there is no r2), using the bwa-mem wrappers when
      ``args.bwa_mem`` is set and the plain bwa wrappers otherwise,
    * merges the paired-end and single-end BAMs when both exist,
    * computes coverage, filters the contigs and symlinks the trimmed
      FASTA into ``<args.assemblies>/contigs-trimmed``.

    Raises:
        EnvironmentError: if ``bwa`` cannot be located.
        IOError: if a sample's assembly is missing, or if no BAM file was
            produced for a sample.
    """
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames")
    samples = get_input_data(args.assemblo_config, None)
    # Make sure bwa is available.  The path itself is not used in this
    # function, so the lookup result is deliberately discarded.
    try:
        which('bwa')[0]
    except (EnvironmentError, IndexError):
        raise EnvironmentError("Cannot find bwa.  Ensure it is installed and in your $PATH")
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.assemblies, 'contigs-trimmed')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    # the aligner pair depends only on the command-line flag, so pick it once
    if args.bwa_mem:
        align_pe, align_se = bwa_mem_pe_align, bwa_mem_se_align
    else:
        align_pe, align_se = bwa_pe_align, bwa_se_align
    for sample, reads in samples:
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # ensure that assembly exists
        assembly_pth = os.path.join(args.assemblies, sample)
        assembly = os.path.join(assembly_pth, "contigs.fasta")
        if not os.path.exists(assembly):
            raise IOError("Assembly for {} does not appear to exist.".format(sample))
        if args.clean:
            cleanup_trinity_assembly_folder(log, assembly_pth)
        # determine the types of raw read data that we have
        fastq = get_input_files(reads, args.subfolder, log)
        # create the bwa index
        bwa_create_index_files(log, assembly)
        samtools_create_faidx(log, sample, assembly_pth, assembly)
        picard_create_reference_dict(log, sample, assembly_pth, assembly)
        bam = False
        bam_se = False
        # paired-end alignment
        if fastq.r1 and fastq.r2:
            bam = align_pe(log, sample, assembly_pth, assembly, args.cores, fastq.r1, fastq.r2)
            bam = _clean_and_tag_bam(log, sample, assembly_pth, bam, "pe")
        # get singleton reads for alignment; if we only have se reads,
        # those will be in fastq.r1 only
        if fastq.singleton:
            se_reads = fastq.singleton
        elif fastq.r1 and not fastq.r2:
            se_reads = fastq.r1
        else:
            se_reads = None
        if se_reads:
            bam_se = align_se(log, sample, assembly_pth, assembly, args.cores, se_reads)
            bam_se = _clean_and_tag_bam(log, sample, assembly_pth, bam_se, "se")
        # combine the two alignments, or fall back to whichever one exists
        if bam and bam_se:
            bam = picard_merge_two_bams(log, sample, assembly_pth, bam, bam_se)
        elif bam_se:
            bam = bam_se
        if not bam:
            raise IOError("There is no BAM file.  Check bwa log files for problems.")
        samtools_index(log, sample, assembly_pth, bam)
        coverage = gatk_coverage(log, sample, assembly_pth, assembly, args.cores, bam)
        overall_contigs = get_coverage_from_gatk(log, sample, assembly_pth, coverage, args.velvet)
        remove_gatk_coverage_files(log, assembly_pth, coverage)
        trimmed_fasta_path = filter_screened_contigs_from_assembly(log, sample, assembly_pth, assembly, overall_contigs)
        symlink_trimmed_contigs(log, sample, contig_dir, trimmed_fasta_path)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))