def main(args):
    # setup logging
    log, my_name = setup_logging(args)
    # create the fasta dictionary
    loci = get_fasta_dict(log, args)
    log.info("Aligning with {}".format(str(args.aligner).upper()))
    opts = [[args.window, args.threshold, args.no_trim, args.proportion, args.max_divergence, args.min_length] \
            for i in range(len(loci))]
    # combine loci and options
    params = zip(loci.items(), opts)
    log.info("Alignment begins. 'X' indicates dropped alignments (these are reported after alignment)")
    # During alignment, drop into sys.stdout for progress indicator
    # because logging in multiprocessing is more painful than what
    # we really need.  Return to logging when alignment completes.
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        alignments = pool.map(align, params)
    else:
        alignments = map(align, params)
    # kick the stdout down one line since we were using sys.stdout
    print("")
    # drop back into logging
    log.info("Alignment ends")
    # write the output files
    write_alignments_to_outdir(log, args.output, alignments, args.output_format)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
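# A minimal sketch of the `align` worker consumed by map()/Pool.map() above --
# an assumption for illustration (the real worker is defined elsewhere in the
# package).  It shows how each element of `params` unpacks into one locus plus
# the shared option list, and how the "."/"X" progress convention uses sys.stdout.
import sys

def align(params):
    (name, sequences), (window, threshold, no_trim, proportion,
                        max_divergence, min_length) = params
    # ... the real worker aligns `sequences` (e.g. with MAFFT/MUSCLE) and trims
    # the result here; `aln` would be None if the alignment is dropped ...
    aln = sequences
    sys.stdout.write("." if aln is not None else "X")
    sys.stdout.flush()
    return (name, aln)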
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args.verbosity, args.log_path)
    text = " Starting {} ".format(my_name)
    log.info(text.center(65, "="))
    alignments = []
    log.info("Getting aligned sequences for trimming")
    for ftype in get_file_extensions(args.input_format):
        alignments.extend(glob.glob(os.path.join(args.input, "*{}".format(ftype))))
    # package up needed arguments for map()
    package = [args.input_format, args.window, args.threshold, args.proportion, args.max_divergence, args.min_length]
    params = zip([package] * len(alignments), alignments)
    log.info("Alignment begins. 'X' indicates dropped alignments (these are reported after alignment)")
    # if --multiprocessing, use Pool.map(), else use map()
    # can also extend to MPI map, but not really needed on multicore
    # machine
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores - 1)
        alignments = pool.map(get_and_trim_alignments, params)
    else:
        alignments = map(get_and_trim_alignments, params)
    # kick the stdout down one line since we were using sys.stdout
    print("")
    # drop back into logging
    log.info("Alignment ends")
    # write the output files
    write_alignments_to_outdir(log, args.output, alignments, args.output_format)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_files(args.input, args.input_format)
    for f in files:
        try:
            aln = AlignIO.read(f, args.input_format)
            if args.containing:
                containing = align_contains_taxa(args, aln)
            else:
                containing = True
            if args.min_length:
                length = align_min_length(args, aln)
            else:
                length = True
            if args.min_taxa:
                taxa = align_min_taxa(args, aln)
            else:
                taxa = True
            if containing and taxa and length:
                log.info("Good alignment: {0}".format(os.path.basename(f)))
            if containing and taxa and length and args.output:
                name = os.path.basename(f)
                shutil.copy(f, os.path.join(args.output, name))
        except ValueError, e:
            if e.message == 'No records found in handle':
                print 'No records found in {0}'.format(os.path.basename(f))
            else:
                raise ValueError('Something is wrong with alignment {0}'.format(os.path.basename(f)))
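# Hedged sketches of the three filter helpers used above (assumptions -- the real
# functions are defined elsewhere): each takes the parsed args and a Biopython
# alignment object and returns True if the alignment passes the check.
def align_min_taxa(args, aln):
    # count rows that are not entirely gaps
    count = sum(1 for taxon in aln if set(str(taxon.seq)) != set("-"))
    return count >= args.min_taxa

def align_min_length(args, aln):
    return aln.get_alignment_length() >= args.min_length

def align_contains_taxa(args, aln):
    # args.containing is assumed to be a list of required taxon names
    return set(args.containing).issubset(set(taxon.id for taxon in aln))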
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    #text = " Starting {} ".format(my_name)
    #log.info(text.center(65, "="))
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # compile our regexes once
    n_bases = re.compile("[Nn]+")
    x_bases = re.compile("[Xx]+")
    work = [[file, n_bases, x_bases, args.input_format, args.output, args.do_not_screen_n, args.do_not_screen_x] for file in files]
    log.info("Screening alignments for problematic bases".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(screen_files, work)
        pool.close()
    else:
        results = map(screen_files, work)
    count = 0
    for result in results:
        if result is None:
            count += 1
        else:
            log.warn("Removed locus {} due to presence of {} bases".format(
                result[0],
                result[1]
            ))
    log.info("Copied {} good alignments".format(count))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
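# A sketch of the `screen_files` worker (an assumption -- the real function is not
# shown): it unpacks one work unit, searches every sequence for the compiled N/X
# regexes, copies clean alignments to the output directory, and returns None on
# success or (locus, offending base) so main() can log the removal.
import os
import shutil
from Bio import AlignIO

def screen_files(work):
    (file, n_bases, x_bases, input_format, output,
     do_not_screen_n, do_not_screen_x) = work
    aln = AlignIO.read(file, input_format)
    for taxon in aln:
        seq = str(taxon.seq)
        if not do_not_screen_n and n_bases.search(seq):
            return (os.path.basename(file), "N")
        if not do_not_screen_x and x_bases.search(seq):
            return (os.path.basename(file), "X")
    shutil.copy(file, output)
    return None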
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read alignments
    log.info("Reading input alignments in NEXUS format")
    nexus_files = glob.glob(os.path.join(args.alignments, '*.nex*'))
    data = [(os.path.basename(fname), Nexus.Nexus(fname)) for fname in nexus_files]
    log.info("Concatenating files")
    concatenated = Nexus.combine(data)
    if not args.nexus:
        concat_file = os.path.join(args.output, os.path.basename(args.alignments) + ".phylip")
        if args.charsets:
            sets = concatenated.append_sets()
            charset_file = os.path.join(args.output, os.path.basename(args.alignments) + ".charsets")
            log.info("Writing charsets to {}".format(
                charset_file
            ))
            with open(charset_file, 'w') as outf:
                outf.write(sets)
        log.info("Writing concatenated PHYLIP alignment to {}".format(concat_file))
        concatenated.export_phylip(concat_file)
    else:
        concat_file = os.path.join(args.output, os.path.basename(args.alignments) + ".nexus")
        if args.charsets:
            log.info("Writing concatenated alignment to NEXUS format (with charsets)")
            concatenated.write_nexus_data(concat_file)
        else:
            log.info("Writing concatenated alignment to NEXUS format (without charsets)")
            concatenated.write_nexus_data(concat_file, append_sets=False)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
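# For reference, a self-contained sketch of the Bio.Nexus calls used above, with
# hypothetical file names: Nexus.combine() takes (name, Nexus) tuples,
# append_sets() returns the charset block as a string, and
# export_phylip()/write_nexus_data() write the concatenated matrix.
from Bio.Nexus import Nexus

nexus_files = ["locus1.nex", "locus2.nex"]  # hypothetical inputs
data = [(fname, Nexus.Nexus(fname)) for fname in nexus_files]
combined = Nexus.combine(data)
print combined.append_sets()  # e.g. "charset locus1.nex = 1-612;" lines
combined.export_phylip("concat.phylip")
combined.write_nexus_data("concat.nexus", append_sets=False)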
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_files(args.alignments, args.input_format)
    if len(files) == 0:
        raise IOError("There are no {}-formatted alignments in {}.".format(
            args.input_format,
            args.alignments
        ))
    if args.shorten_name and not args.name_conf:
        name_map = shorten_name(args, files[0])
    elif args.shorten_name and args.name_conf:
        conf = ConfigParser.ConfigParser()
        conf.readfp(open(args.name_conf))
        name_map = dict(conf.items('taxa'))
    else:
        name_map = None
    params = [[f, args, name_map] for f in files]
    sys.stdout.write('Converting')
    sys.stdout.flush()
    if args.cores > 1:
        pool = Pool(args.cores)
        pool.map(convert_files_worker, params)
    else:
        map(convert_files_worker, params)
    print ""
    if args.shorten_name:
        log.info("Taxa renamed (from) => (to):")
        for k, v in name_map.iteritems():
            log.info("\t{0} => {1}".format(k, v))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    sys.stdout.write("Running")
    sys.stdout.flush()
    with open(args.output, 'w') as outf:
        for f in files:
            aln = AlignIO.read(f, args.input_format)
            locus = os.path.splitext(os.path.basename(f))[0]
            for taxon in aln:
                if taxon.id == args.taxon:
                    seq = str(taxon.seq).replace('-', '').replace('?','')
                    record = SeqRecord(Seq(seq), id=locus, name="", description="")
                    if not len(seq) == 0:
                        outf.write(record.format("fasta"))
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    else:
                        log.info("Could not write {}".format(locus))
    print ""
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    input = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        pass
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    else:
        pass
    try:
        abyss_pe = which('abyss-pe')[0]
        abyss_se = which('ABYSS')[0]
    except:
        raise EnvironmentError("Cannot find abyss-pe or ABYSS.  Ensure they "
                               "are installed and in your $PATH")
    # run abyss in (mostly) single-threaded mode for RAM and simplicity
    # reasons.  abyss-map will run using as many cores as user specifies.
    for group in input:
        sample, dir = group
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(dir, args.subfolder, log)
        # copy the read data over, combine singletons with read 1
        # and run the assembly for PE data.
        if reads.r1 and reads.r2:
            output = run_abyss_pe(abyss_pe, args.kmer, reads, args.cores,
                                  sample_dir, log)
            if args.clean:
                cleanup_abyss_assembly_folder(output, log)
        elif reads.r1 and not reads.r2:
            output = run_abyss_se(abyss_se, args.kmer, reads,
                                  sample_dir, log)
            if args.clean:
                cleanup_abyss_assembly_folder(output, log, single_end=True)
        contigs_file = get_contigs_file_from_output(output)
        # remove degenerate bases, contigs < 100 bp, and rename
        # contigs to velvet-style naming
        contigs_file = convert_abyss_contigs_to_velvet(contigs_file)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    input = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        pass
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    else:
        pass
    try:
        velveth = which('velveth')[0]
        velvetg = which('velvetg')[0]
    except:
        raise EnvironmentError("Cannot find velveth or velvetg.  Ensure they "
                               "are installed and in your $PATH")
    # run velvet in single-threaded mode for RAM and simplicity
    # reasons.
    for group in input:
        sample, dir = group
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(dir, args.subfolder, log)
        # copy the read data over, combine singletons with read 1
        # and run the assembly for PE data.
        if reads.r1 and reads.r2:
            output = run_velveth(velveth, args.kmer, reads, sample_dir, log)
            output = run_velvetg(velvetg, args.kmer, output, log)
        elif reads.r1 and not reads.r2 and not reads.singleton:
            # velvet is not run for single-end-only samples, so skip to the next sample
            continue
        if args.clean:
            cleanup_velvet_assembly_folder(output, log)
        contigs_file = get_contigs_file_from_output(output)
        # create generic link in assembly folder for covg. computation
        generate_within_dir_symlink(sample_dir, contigs_file)
        # link to the standard (non-trimmed) assembly in ../contigs
        generate_symlinks(contig_dir, sample, contigs_file, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    # get args
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    log.info("Logging set-up")

    matches = get_match_records(log, args.input_xml, args.chromosome)
    matches_names = fix_labels(matches, log)
    vcf_output = create_vcf(log, args.input_vcf, args.output_vcf,
                            matches_names)
    log.info("Done!")
def main():
    # get args and options
    args = get_args()
    # if we're resuming, we need to set output = resume
    # so that we can check previously created files.
    if args.resume:
        args.output = args.resume
    # setup logging
    log, my_name = setup_logging(args)
    log.info("Creating the output directory")
    # get the input data
    log.info("Fetching input filenames")
    assemblies = sorted(glob.glob(os.path.join(args.assemblies, "*")))
    # remove the contigs/contigs-trimmed directories
    extra = set(['contigs', 'contigs-trimmed'])
    assemblies = [assembly for assembly in assemblies if os.path.basename(assembly) not in extra]
    loci = get_match_count_loci(log, args.match_count_output)
    # setup database connection
    conn = sqlite3.connect(args.locus_db)
    cur = conn.cursor()
    for assembly in assemblies:
        organism = os.path.basename(assembly)
        if args.resume and os.path.exists(os.path.join(args.output, "{}.reads-on-target.txt".format(organism))):
            log.warn("Skipping previously processed {} data (--resume)".format(organism))
        else:
            reference = os.path.join(assembly, "contigs.fasta")
            bams = glob.glob(os.path.join(assembly, "*.bam"))
            try:
                assert len(bams) == 1
                bam = bams[0]
            except:
                raise IOError("There appears to be more than one BAM file for {}".format(organism))
            # pretty print taxon status
            text = " Processing {} ".format(organism)
            log.info(text.center(65, "-"))
            locus_map = get_sqlite_loci_for_taxon(log, args.locus_db, cur, organism, loci)
            locus_map_names = set(locus_map.keys())
            create_per_base_coverage_file(log, args.output, assembly, organism, locus_map, locus_map_names)
            coverages_dict = create_per_locus_coverage_file(log, args.output, assembly, organism, locus_map, locus_map_names)
            # pass the same intervals as targets and base - we don't care that much here about bait performance
            hs_metrics_file = picard_calculate_hs_metrics(log, organism, args.output, reference, bam, coverages_dict["interval_list"], coverages_dict["interval_list"])
            on_target_dict = picard_get_percent_reads_on_target(log, hs_metrics_file, organism)
            log.info("\t{} contigs, mean trimmed length = {:.1f}, mean trimmed coverage = {:.1f}x, on-target bases (uce contigs) = {:.1f}%, unique reads aligned (all contigs) = {:.1f}%".format(
                coverages_dict["count"],
                coverages_dict["mean_length_trimmed"],
                coverages_dict["mean_trim_cov"],
                float(on_target_dict["PCT_SELECTED_BASES"]) * 100,
                float(on_target_dict["PCT_PF_UQ_READS_ALIGNED"]) * 100,
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
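# A hedged sketch of how `picard_get_percent_reads_on_target` might parse the hs
# metrics file (an assumption -- the real helper is not shown): Picard metrics files
# start with "#"-prefixed comment lines, followed by a tab-separated header row and
# a row of values, so zipping the two rows yields keys like PCT_SELECTED_BASES used above.
def picard_get_percent_reads_on_target(log, hs_metrics_file, organism):
    with open(hs_metrics_file) as infile:
        rows = [line.rstrip("\n").split("\t") for line in infile
                if line.strip() and not line.startswith("#")]
    header, values = rows[0], rows[1]
    return dict(zip(header, values))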
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    env = Environment(loader=FileSystemLoader(args.templates))
    for i in xrange(args.trees):
        submit_script_pth = compute_starting_parsimony_tree(log, args, env, i)
        submit_parsimony_job(log, args, env, i, submit_script_pth)
    # convert the phylip file to binary format
    submit_script_pth = prep_parser_script(log, args, env)
    submit_parser_job(log, args, env, submit_script_pth)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    env = Environment(loader=FileSystemLoader(args.templates))
    # check for the binary alignment
    binary_name = check_for_binary_phylip(log, args)
    # check for starting trees
    starting_trees = get_starting_trees(log, args)
    # create an ExaML run script for each starting tree
    for starting_tree in starting_trees:
        prep_examl_script(log, args, env, starting_tree, binary_name)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # read config file output by match_count_config.py
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # make case sensitive
    config.optionxform = str
    config.read(args.match_count_output)
    # read the incomplete matrix file that contains loci that are incomplete
    if args.incomplete_matrix:
        incomplete = ConfigParser.RawConfigParser(allow_no_value=True)
        incomplete.optionxform = str
        incomplete.read(args.incomplete_matrix)
        missing = get_missing_loci_from_conf_file(incomplete)
    else:
        missing = None
    # get the taxa in the alignment
    organisms = get_names_from_config(log, config, 'Organisms')
    # get input files
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [[
            file,
            args.input_format,
            organisms,
            args.check_missing,
            missing,
            args.verbatim,
            args.min_taxa,
            args.output,
            args.output_format
        ] for file in files
    ]
    log.info("Adding missing data designators using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(add_designators, work)
    else:
        results = map(add_designators, work)
    for result in results:
        if result is not None:
            log.info("Dropped {} because of too few taxa (N < {})".format(
                result,
                args.min_taxa
            ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # change to working dir
    starting_dir = os.getcwd()
    # convert data to binary
    binary_file_pth = convert_phylip_to_examl_binary(log, args)
    for iter in xrange(args.trees):
        # compute starting tree on data
        seed, starting_tree_pth = compute_starting_parsimony_tree(log, args, iter, binary_file_pth)
        # run examl against binary data with starting tree
        run_examl_against_binary_data(log, args, iter, binary_file_pth, starting_tree_pth)
    # return to starting dir
    os.chdir(starting_dir)
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    conf = ConfigParser.ConfigParser()
    conf.optionxform = str
    conf.read(args.config)
    items = conf.items("samples")
    #pdb.set_trace()
    for item in items:
        name, file_names = item
        files = file_names.strip().split(",")
        with open(os.path.join(args.output, name), 'wb') as outfile:
            for infile in sorted(files):
                shutil.copyfileobj(open(infile), outfile)
                log.info("Copied {} to {}".format(
                    os.path.basename(infile),
                    name
                ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
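# The config read above is assumed to look like the hypothetical example below:
# each option under [samples] maps an output name to a comma-separated list of
# input files that get concatenated.  A quick check of how ConfigParser sees it:
import ConfigParser
import StringIO

example = """[samples]
genus_species1 = /path/to/a_R1.fastq,/path/to/b_R1.fastq
genus_species2 = /path/to/c_R1.fastq
"""
conf = ConfigParser.ConfigParser()
conf.optionxform = str
conf.readfp(StringIO.StringIO(example))
print conf.items("samples")
# roughly: [('genus_species1', '/path/to/a_R1.fastq,/path/to/b_R1.fastq'),
#           ('genus_species2', '/path/to/c_R1.fastq')]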
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.taxon_list_config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    if args.extend_locus_db:
        log.info("Attaching extended database {}".format(os.path.basename(args.extend_locus_db)))
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_locus_db)
        c.execute(query)
    organisms = get_taxa_from_config(config, args.taxon_group)
    log.info("There are {} taxa in the taxon-group '[{}]' in the config file {}".format(
        len(organisms),
        args.taxon_group,
        os.path.basename(args.taxon_list_config)
    ))
    uces = get_uce_names(log, c)
    log.info("There are {} total UCE loci in the database".format(len(uces)))
    all_counts = []
    if args.optimize:
        shared_uces, organisms = sample_match_groups(args, c, organisms, uces, all_counts)
    else:
        shared_uces = dont_sample_match_groups(log, args, c, organisms, uces)
    if args.output and organisms and not args.silent:
        log.info("Writing the taxa and loci in the data matrix to {}".format(args.output))
        with open(args.output, 'w') as outf:
            outf.write("[Organisms]\n{0}\n[Loci]\n{1}\n".format(
                '\n'.join(sorted(organisms)),
                '\n'.join(sorted(shared_uces))
            ))
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [(args, f) for f in files]
    sys.stdout.write("Running")
    sys.stdout.flush()
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(worker, work)
    else:
        results = map(worker, work)
    # flatten results
    all_taxa = set([item for sublist in results for item in sublist])
    print ""
    log.info("Taxon names in alignments: {0}".format(
        ','.join(list(all_taxa))
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    work = [[file, args.input_format] for file in files]
    log.info("Computing summary statistics using {} cores".format(args.cores))
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        summary = pool.map(get_stats, work)
    else:
        summary = map(get_stats, work)
    # alignments
    a_vars = get_lengths(summary)
    log_length_summary(log, len(summary), a_vars)
    # taxa
    t_vars = get_taxa(summary)
    log_taxa_summary(log, t_vars)
    # missing
    m_vars = get_percent_missing(summary)
    log_missing_summary(log, m_vars)
    # characters
    all_bases, sum_characters = total_characters(summary)
    sum_nucleotides = total_nucleotides(summary)
    log_char_summary(log, sum_characters, sum_nucleotides)
    # matrix
    percentages = get_matrix_percentages(t_vars[0])
    log_matrix_summary(log, percentages)
    # taxa dist.
    log_taxa_dist(log, args.show_taxon_counts, t_vars[0])
    # character dist
    log_character_dist(log, all_bases)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # find all alignments
    files = get_alignment_files(log, args.alignments, args.input_format)
    # determine the minimum count of taxa needed in each alignment, given --percent
    min_count = int(math.floor(args.percent * args.taxa))
    work = [[file, args.input_format, min_count, args.output] for file in files]
    if args.cores > 1:
        assert args.cores <= multiprocessing.cpu_count(), "You've specified more cores than you have"
        pool = multiprocessing.Pool(args.cores)
        results = pool.map(copy_over_files, work)
    else:
        results = map(copy_over_files, work)
    log.info("Copied {0} alignments of {1} total containing ≥ {2} proportion of taxa (n = {3})".format(
        sum(results),
        len(results),
        args.percent,
        min_count
    ))
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames and creating output directories")
    input = get_input_data(args.config, args.dir)
    # create the output directory if it does not exist
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        pass
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.output, 'contigs')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    else:
        pass
    # Get path to trinity.  Standard name is `Trinity.pl`.
    # I usually symlink to `trinity`
    #TODO:  Change this to system "which" - this is just too flaky in certain cases
    try:
        trinity = which('trinity')[0]
    except EnvironmentError:
        trinity = which('Trinity.pl')[0]
    except:
        raise EnvironmentError("Cannot find Trinity.  Ensure it is installed and in your $PATH")
    for group in input:
        sample, dir = group
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # make a directory for sample-specific assemblies
        sample_dir = os.path.join(args.output, sample)
        os.makedirs(sample_dir)
        # determine how many files we're dealing with
        reads = get_input_files(dir, args.subfolder, log)
        # copy the read data over, combine singletons with read 1
        # and run the assembly for PE data.
        if reads.r1 and reads.r2 and reads.singleton:
            copy_read_data(reads, sample_dir, log)
            combine_read_data(reads, log)
            output = run_trinity_pe(trinity, reads, args.cores, args.min_kmer_coverage, log)
            if args.clean:
                cleanup_trinity_assembly_folder(output, log)
        # we don't need to combine singleton files here.  copy
        # the read data over and run the assembly for PE data
        elif reads.r1 and reads.r2:
            copy_read_data(reads, sample_dir, log)
            output = run_trinity_pe(trinity, reads, args.cores, args.min_kmer_coverage, log)
            if args.clean:
                cleanup_trinity_assembly_folder(output, log)
        # here, we don't have PE data, so copy the file over
        # and run the assembly for SE data
        elif reads.r1:
            copy_read_data(reads, sample_dir, log)
            output = run_trinity_se(trinity, reads, args.cores, args.min_kmer_coverage, log)
            if args.clean:
                cleanup_trinity_assembly_folder(output, log)
        # generate symlinks to assembled contigs
        generate_symlinks(contig_dir, sample, reads, log)
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.match_count_output)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    if args.extend_locus_db:
        log.info("Attaching extended database {}".format(os.path.basename(args.extend_locus_db)))
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_locus_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    log.info("There are {} taxa in the match-count-config file named {}".format(
        len(organisms),
        os.path.basename(args.match_count_output)
    ))
    uces = get_names_from_config(config, 'Loci')
    if not args.incomplete_matrix:
        log.info("There are {} shared UCE loci in a COMPLETE matrix".format(len(uces)))
    else:
        log.info("There are {} UCE loci in an INCOMPLETE matrix".format(len(uces)))
    regex = re.compile("[N,n]{1,21}")
    if args.incomplete_matrix:
        incomplete_outf = open(args.incomplete_matrix, 'w')
    with open(args.output, 'w') as uce_fasta_out:
        for organism in organisms:
            text = "Getting UCE loci for {0}".format(organism)
            log.info(text.center(65, "-"))
            written = []
            # going to need to do something more generic w/ suffixes
            name = organism.replace('_', '-')
            if args.incomplete_matrix:
                if not organism.endswith('*'):
                    reads = find_file(args.contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
                elif args.extend_locus_contigs:
                    # remove the asterisk
                    name = name.rstrip('*')
                    reads = find_file(args.extend_locus_contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True)
            else:
                if not name.endswith('*'):
                    reads = find_file(args.contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism, uces)
                elif name.endswith('*') and args.extend_locus_contigs:
                    # remove the asterisk
                    name = name.rstrip('*')
                    reads = find_file(args.extend_locus_contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True)
            count = 0
            log.info("There are {} UCE loci for {}".format(len(node_dict), organism))
            log.info("Parsing and renaming contigs for {}".format(organism))
            for seq in SeqIO.parse(open(reads, 'rU'), 'fasta'):
                name = get_contig_name(seq.id).lower()
                if name in node_dict.keys():
                    seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip('*'))
                    seq.name = ''
                    seq.description = ''
                    # deal with strandedness because aligners sometimes don't, which
                    # is annoying
                    if node_dict[name][1] == '-':
                        seq.seq = seq.seq.reverse_complement()
                    # Replace any occurrences of <21 Ns in a given sequence with
                    # blanks.  These should gap out during alignment.  Also, strip
                    # leading/trailing lowercase bases from velvet assemblies
                    # (lowercase bases indicate low coverage, and these have been
                    # problematic in downstream alignments).
                    seq, count = replace_and_remove_bases(regex, seq, count)
                    uce_fasta_out.write(seq.format('fasta'))
                    written.append(str(node_dict[name][0]))
                else:
                    pass
            if count > 0:
                log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism))
            if args.incomplete_matrix and missing:
                log.info("Writing missing locus information to {}".format(args.incomplete_matrix))
                incomplete_outf.write("[{0}]\n".format(organism))
                for name in missing:
                    incomplete_outf.write("{0}\n".format(name))
                    written.append(name)
            assert set(written) == set(uces), "UCE names do not match"
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
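# One way `replace_and_remove_bases` might be written, judging from the comments
# above (an assumption -- the real function is not shown): short runs of Ns are
# removed so they gap out during alignment, leading/trailing lowercase (low
# coverage) bases are stripped, and `count` tracks how many contigs were changed.
from Bio.Seq import Seq

def replace_and_remove_bases(regex, seq, count):
    old = str(seq.seq)
    new = regex.sub("", old).strip("acgtn")
    if new != old:
        count += 1
    seq.seq = Seq(new)
    return seq, count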
def main():
    args = get_args()
    log, my_name = setup_logging(args)
    regex = re.compile(args.regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError("The directory {} already exists.  Please check and remove by hand.".format(args.output))
    uces = set(new_get_probe_name(seq.id, regex) for seq in SeqIO.parse(open(args.probes, 'rU'), 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        log,
        os.path.join(args.output, 'probe.matches.sqlite'),
        organisms,
        uces
    )
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz'
        )
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(
            contig,
            args.probes,
            args.min_coverage,
            args.min_identity,
            output
        )
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_contig_name(lz.name1)
                uce_name = new_get_probe_name(lz.name2, regex)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uce_dupe_contigs, uce_dupe_uces = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(uce_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(uce_dupe_uces) != 0:
                dupefile.write("[{} - probes hitting multiple contigs]\n".format(critter))
                for uce in uce_dupe_uces:
                    dupefile.write("{}:{}\n".format(uce, ', '.join(revmatches[uce])))
                dupefile.write("\n")
            if len(contigs_matching_mult_uces) != 0:
                dupefile.write("[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_uces:
                    dupefile.write("{}:{}\n".format(dupe, ', '.join(matches[dupe])))
                dupefile.write("\n")
        #pdb.set_trace()
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(
            log,
            critter,
            matches,
            contigs,
            probe_dupes,
            contigs_matching_mult_uces,
            uce_dupe_uces
        )
    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The UCE match database is in {}".format(os.path.join(args.output, "probes.matches.sqlite")))
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
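# Hedged sketches of the two duplicate checks used above (assumptions -- the real
# helpers are not shown): `matches` maps contig -> set of UCE loci it hit, and
# `revmatches` maps UCE locus -> set of contigs that hit it, so duplicates are
# simply the keys whose sets contain more than one element.
def check_contigs_for_dupes(matches):
    # contigs that hit more than one UCE locus are ambiguous
    return set(node for node, uces in matches.iteritems() if len(uces) > 1)

def check_loci_for_dupes(revmatches):
    # UCE loci hit by more than one contig; return both the contigs and the loci
    dupe_contigs = set()
    dupe_uces = set()
    for uce, nodes in revmatches.iteritems():
        if len(nodes) > 1:
            dupe_contigs.update(nodes)
            dupe_uces.add(uce)
    return dupe_contigs, dupe_uces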
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    organisms = get_names_from_config(config, "Organisms")
    log.info(
        "There are {} taxa in the match-count-config file named {}".format(
            len(organisms), os.path.basename(args.config)
        )
    )
    exons = get_names_from_config(config, "Loci")
    log.info("There are {} exon loci in the matrix".format(len(exons)))
    regex = re.compile("[N,n]{1,21}")
    out_dir = "/".join(args.output.split("/")[:-1])
    temp_conf = os.path.join(out_dir, "config_extended")
    incomplete_outf = open(temp_conf, "w")
    with open(args.output, "w") as exon_fasta_out:
        for organism in organisms:
            text = "Getting exon loci for {0}".format(organism)
            log.info(text.center(65, "-"))
            written = []
            # going to need to do something more generic w/ suffixes
            name = organism.replace("_", "-")
            if not organism.endswith("*"):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_exons(c, organism, exons, extend=False, notstrict=True)
            count = 0
            log.info("There are {} exon loci for {}".format(len(node_dict), organism))
            log.info("Parsing and renaming contigs for {}".format(organism))
            for seq in SeqIO.parse(open(reads, "rU"), "fasta"):
                name = get_contig_name(seq.id).lower()
                # print "name:", name
                # print node_dict.keys()

                if name in node_dict.keys():
                    seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip("*"))
                    seq.name = ""
                    seq.description = ""
                    # deal with strandedness because aligners sometimes don't, which
                    # is annoying
                    if node_dict[name][1] == "-":
                        seq.seq = seq.seq.reverse_complement()
                    # Replace any occurrences of <21 Ns in a given sequence with
                    # blanks.  These should gap out during alignment.  Also, strip
                    # leading/trailing lowercase bases from velvet assemblies
                    # (lowercase bases indicate low coverage, and these have been
                    # problematic in downstream alignments).
                    seq, count = replace_and_remove_bases(regex, seq, count)
                    exon_fasta_out.write(seq.format("fasta"))
                    # print "node_dict:", node_dict[name][0]
                    written.append(str(node_dict[name][0]))
                else:
                    pass
            if count > 0:
                log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism))
            if missing:
                log.info("Writing missing locus information to {}".format(temp_conf))
                incomplete_outf.write("[{0}]\n".format(organism))
                for name in missing:
                    incomplete_outf.write("{0}\n".format(name))
                    written.append(name)
            # print written
            # print exons
            assert set(written) == set(exons), "exon names do not match"
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main():
	args = get_args()
	log, my_name = setup_logging(args)
	pre_regex = args.regex
	regex = re.compile("^(%s)(?:.*)" %pre_regex)
	if not os.path.isdir(args.output):
		os.makedirs(args.output)
	else:
		raise IOError("The directory {} already exists.  Please check and remove by hand.".format(args.output))
	exons = set(new_get_probe_name(seq.id, regex) for seq in SeqIO.parse(open(args.reference, 'rU'), 'fasta'))
	#print exons
	if args.dupefile:
		dupes = get_dupes(log, args.dupefile, regex)
	else:
		dupes = set()
	fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
	for f in fasta_files:
		replace_bad_fasta_chars = "sed -i -e '/>/! s=[K,Y,R,S,M,W,k,y,r,s,m,w]=N=g' %s" %f
		os.system(replace_bad_fasta_chars)
	#print fasta_files
	organisms = get_organism_names_from_fasta_files(fasta_files)
	#print organisms
	conn, c = create_probe_database(
		log,
		os.path.join(args.output, 'probe.matches.sqlite'),
		organisms,
		exons
	)
	log.info("Processing contig data")
	# open a file for duplicate writing, if we're interested
	if args.keep_duplicates is not None:
		dupefile = open(args.keep_duplicates, 'w')
	else:
		dupefile = None
	log.info("{}".format("-" * 65))
	kmers = {}
	for contig in sorted(fasta_files):
		critter = os.path.basename(contig).split('.')[0].replace('-', "_")
		output = os.path.join(
			args.output,
			os.path.splitext(os.path.basename(contig))[0] + '.lastz'
		)
		contigs = contig_count(contig)
		# align the probes to the contigs
		alignment = lastz.Align(
			contig,
			args.reference,
			args.min_coverage,
			args.min_identity,
			output
		)
		lzstdout, lztstderr = alignment.run()
		if lztstderr:
			raise EnvironmentError("lastz: {}".format(lztstderr))
		# parse the lastz results of the alignment
		matches = defaultdict(set)
		orientation = defaultdict(set)
		revmatches = defaultdict(set)
		probe_dupes = set()
		if not lztstderr:
			for lz in lastz.Reader(output):
				contig_name = get_contig_name(lz.name1)
				exon_name = new_get_probe_name(lz.name2, regex)
				if args.dupefile and exon_name in dupes:
					probe_dupes.add(exon_name)
				else:
					matches[contig_name].add(exon_name)
					orientation[exon_name].add(lz.strand2)
					revmatches[exon_name].add(contig_name)

		# we need to check nodes for dupe matches to the same probes
		contigs_matching_mult_exons = check_contigs_for_dupes(matches)
		exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(revmatches)
		nodes_to_drop = contigs_matching_mult_exons.union(exon_dupe_contigs)
		# write out duplicates if requested
		if dupefile is not None:
			log.info("Writing duplicates file for {}".format(critter))
			if len(exon_dupe_exons) != 0:
				dupefile.write("[{} - probes hitting multiple contigs]\n".format(critter))
				for exon in exon_dupe_exons:
					dupefile.write("{}:{}\n".format(exon, ', '.join(revmatches[exon])))
				dupefile.write("\n")
			if len(contigs_matching_mult_exons) != 0:
				dupefile.write("[{} - contigs hitting multiple probes]\n".format(critter))
				for dupe in contigs_matching_mult_exons:
					dupefile.write("{}:{}\n".format(dupe, ', '.join(matches[dupe])))
				dupefile.write("\n")

		# remove dupe and/or dubious nodes/contigs
		match_copy = copy.deepcopy(matches)
		for k in match_copy.keys():
			if k in nodes_to_drop:
				del matches[k]
		#print matches
		#print lz.name1
		#get contig id
		#contig_id = re.search("^(\d*)\s\d*\s\d*.*", lz.name1).groups()[0]
		#print matches

		#added function to return the kmer count (sum of all kmers of target contigs)
		for lz in lastz.Reader(output):
			for element in matches:
				#print element, "has to match", lz[1]
				if re.search("^(\d*)\s\d*\s\d*.*", lz[1]).groups()[0] == element:
					kmer_value = get_kmer_value(lz.name1)
					kmers.setdefault(contig,[])
					kmers[contig].append(kmer_value)
		store_lastz_results_in_db(c, matches, orientation, critter)
		conn.commit()
		pretty_log_output(
			log,
			critter,
			matches,
			contigs,
			probe_dupes,
			contigs_matching_mult_exons,
			exon_dupe_exons
		)

	kmerfile = open(os.path.join(args.output,'kmer_count.txt'), 'w')

	for key in kmers:
		count = 0
		for element in kmers[key]:
			count += int(element)
		kmerfile.write("%s : %d\n" %(os.path.basename(key).split('.')[0],count))


	if dupefile is not None:
		dupefile.close()
	log.info("{}".format("-" * 65))
	log.info("The LASTZ alignments are in {}".format(args.output))
	log.info("The exon match database is in {}".format(os.path.join(args.output, "probes.matches.sqlite")))
	text = " Completed {} ".format(my_name)
	log.info(text.center(65, "="))

	# Access the SQL file and export tab-separated text-file
	sql_file = os.path.join(args.output, 'probe.matches.sqlite')
	tsf_out = os.path.join(args.output, 'match_table.txt')
	sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" %(args.sqlite3,sql_file,tsf_out)
	os.system(sql_cmd)

	# Create the config file for the extraction of the desired loci
	output_folder = args.output
	create_conf_cmd = "echo \"[Organisms]\" > %s/config; ls %s/*.lastz | rev | cut -d/ -f1 | rev | cut -d \"_\" -f 1 >> %s/config; echo \"[Loci]\" >> %s/config; tail -n+2 %s/match_table.txt | cut -f 1 >> %s/config" %(output_folder,output_folder,output_folder,output_folder,output_folder,output_folder)
	os.system(create_conf_cmd)
	remove_lastz = "sed -i 's/.lastz//g' %s/config" %output_folder
	os.system(remove_lastz)
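# The export above shells out to the sqlite3 binary and sed; a sketch of the same
# match-table dump done with Python's sqlite3 module instead (an alternative for
# illustration, not the script's actual code), reproducing the '.' null value and
# the tab separator:
import csv
import sqlite3

def export_match_table(sql_file, tsf_out):
    conn = sqlite3.connect(sql_file)
    cur = conn.cursor()
    cur.execute("SELECT * FROM matches")
    with open(tsf_out, "w") as outf:
        writer = csv.writer(outf, delimiter="\t")
        writer.writerow([col[0] for col in cur.description])
        for row in cur.fetchall():
            writer.writerow(["." if value is None else value for value in row])
    conn.close()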
def main():
    # get args and options
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # get the input data
    log.info("Getting input filenames")
    input = get_input_data(args.assemblo_config, None)
    # Get path to bwa
    try:
        bwa = which('bwa')[0]
    except:
        raise EnvironmentError("Cannot find bwa.  Ensure it is installed and in your $PATH")
    # make the symlink directory within the output directory
    contig_dir = os.path.join(args.assemblies, 'contigs-trimmed')
    if not os.path.isdir(contig_dir):
        os.makedirs(contig_dir)
    else:
        pass
    for group in input:
        sample, reads = group
        # pretty print taxon status
        text = " Processing {} ".format(sample)
        log.info(text.center(65, "-"))
        # ensure that assembly exists
        assembly_pth = os.path.join(args.assemblies, sample)
        assembly = os.path.join(assembly_pth, "contigs.fasta")
        if not os.path.exists(assembly):
            raise IOError("Assembly for {} does not appear to exist.".format(sample))
        if args.clean:
            cleanup_trinity_assembly_folder(log, assembly_pth)
        # determine the types of raw read data that we have
        fastq = get_input_files(reads, args.subfolder, log)
        # create the bwa index
        bwa_create_index_files(log, assembly)
        samtools_create_faidx(log, sample, assembly_pth, assembly)
        picard_create_reference_dict(log, sample, assembly_pth, assembly)
        bam = False
        bam_se = False
        if args.bwa_mem and fastq.r1 and fastq.r2:
            bam = bwa_mem_pe_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1, fastq.r2)
            bam = picard_clean_up_bam(log, sample, assembly_pth, bam, "pe")
            bam = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam, "pe")
        elif not args.bwa_mem and fastq.r1 and fastq.r2:
            bam = bwa_pe_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1, fastq.r2)
            bam = picard_clean_up_bam(log, sample, assembly_pth, bam, "pe")
            bam = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam, "pe")
        # get singleton reads for alignment
        if args.bwa_mem and fastq.singleton:
            bam_se = bwa_mem_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.singleton)
            bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se')
            bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se")
        # if we only have se reads, those will be in fastq.r1 only
        elif args.bwa_mem and not fastq.r2 and fastq.r1:
            bam_se = bwa_mem_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1)
            bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se')
            bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se")
        elif not args.bwa_mem and fastq.singleton:
            bam_se = bwa_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.singleton)
            bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se')
            bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se")
        elif not args.bwa_mem and not fastq.r2 and fastq.r1:
            bam_se = bwa_se_align(log, sample, assembly_pth, assembly, args.cores, fastq.r1)
            bam_se = picard_clean_up_bam(log, sample, assembly_pth, bam_se, 'se')
            bam_se = picard_add_rg_header_info(log, sample, assembly_pth, "Generic", bam_se, "se")
        if bam and bam_se:
            bam = picard_merge_two_bams(log, sample, assembly_pth, bam, bam_se)
        elif bam_se and not bam:
            bam = bam_se
        if not bam:
            raise IOError("There is no BAM file.  Check bwa log files for problems.")
        samtools_index(log, sample, assembly_pth, bam)
        coverage = gatk_coverage(log, sample, assembly_pth, assembly, args.cores, bam)
        overall_contigs = get_coverage_from_gatk(log, sample, assembly_pth, coverage, args.velvet)
        remove_gatk_coverage_files(log, assembly_pth, coverage)
        trimmed_fasta_path = filter_screened_contigs_from_assembly(log, sample, assembly_pth, assembly, overall_contigs)
        symlink_trimmed_contigs(log, sample, contig_dir, trimmed_fasta_path)
    # end
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))