def dump_results(outfile, fastadir, contigs, line_width = 50):
	"""
	Write a per-sequence summary table and one FASTA file per region.

	For every key `r` of `contigs`, the file `<fastadir>/<r>.fa` is created (or truncated) and
	receives one FASTA record per entry of `contigs[r]["seqs"]`.  For each of those sequences a
	line is also written to `outfile`, consisting of the first (up to) five whitespace-delimited
	fields of `str(contigs[r]["region"])` joined by tabs, then ":<sequence name>", the sequence
	length and a literal "*", tab-separated.

	outfile    -- object with a write() method that receives the summary lines
	fastadir   -- existing directory in which the per-region FASTA files are written
	contigs    -- mapping: region key -> mapping with entries "region" and "seqs", where
	              "seqs" maps sequence name -> sequence
	line_width -- size and step passed to kmers.kmerize() to split each sequence into FASTA
	              lines [default: 50, the previously hard-coded value]
	              (presumably kmerize(seq, k, step) yields consecutive chunks -- confirm against kmers)
	"""
	## .items() rather than .iteritems(): identical iteration, but valid on Python 2 and 3 alike
	for r, d in contigs.items():
		with open(os.path.join(fastadir, "{}.fa".format(r)), "w") as outfasta:
			## the region prefix is the same for every sequence of this region, so build it once
			region_fields = "\t".join(str(d["region"]).strip().split()[0:5])
			for s, seq in d["seqs"].items():
				outfile.write( "{}:{}\t{}\t{}\n".format(region_fields, s, len(seq), "*") )
				outfasta.write(">{}\n".format(s))
				for km in kmers.kmerize(seq, line_width, line_width):
					outfasta.write("{}\n".format(km))
			## NOTE(review): this fragment begins mid-loop; `ref`, `kmers`, `s`, `e`, `ksize`, `kstep`,
			## `bwts` and `logger` are all defined above the visible range.
			## extend the reference sequence with the name of k-mer `e`, minus its first `kstep` characters
			## (presumably k-mer names are the k-mer sequences themselves -- confirm)
			ref += str(kmers[e].name)[kstep:]

			## step through BWTs trying to assemble between flanking k-mers
			for bb, b in enumerate(bwts):
				## bridge from the k-mer just before the range (s-1) to the k-mer at its end (e);
				## neither `n` nor `bb` is used in the visible part of the loop
				alt, n = assembly.build_bridge(b, str(kmers[s-1].name), str(kmers[e].name))
				# check for successful assembly
				if not len(alt):
					# unsuccessful assembly
					logger.info("Couldn't bridge range {}:{}-{}".format(kmers[s-1].chrom, kmers[s-1].start+1, kmers[e].end))
				else:
					# assembly successful; align to reference
					# only the first element of `alt` is used from here on
					ref_aln, alt_aln, ops = swalign.align(ref, alt[0])

					# check coverage of alt alleles
					# count reads in this BWT for every k-mer of alt[0], ignoring k-mers shorter than ksize
					alt_counts = []
					for k in ktools.kmerize(alt[0], ksize, kstep):
						if len(k) == ksize:
							alt_counts.append( util.count_reads(b, k) )
					# NOTE(review): if alt[0] yields no full-length k-mer, alt_counts is empty and
					# np.median() returns nan -- confirm downstream code tolerates that
					support = np.median(alt_counts)
					#print(ref_aln)
					#print(ops)
					#print(alt_aln)
					# iterate on variant sites in alignment
					for offset, site_ref, site_alt in swalign.reconcile(ref_aln, alt_aln):
						# generate VCF entry
						# coordinates are relative to the k-mer preceding the range (s-1)
						CHROM = kmers[s-1].chrom
						POS = kmers[s-1].start+offset+1 #NB: must convert to 1-based
						ID = "."
						REF = site_ref
						ALT = site_alt
						QUAL = "."
Exemple #3
0
## Register the command-line options on `parser` (created above this section) and parse them.
## NOTE(review): the "U" in mode "rU" is a Python 2 universal-newlines flag that newer Python 3
## releases reject; it is kept here so that Python 2 behaviour is unchanged.
parser.add_argument(	"-f", "--fasta", type = argparse.FileType("rU"),
						help = "fasta file containing sequences to search with" )
parser.add_argument(	"-M", "--msbwt", type = io.readable_dir, nargs = "+",
						help = "one or more msBWTs in which to count k-mers" )
parser.add_argument(	"-k", "--kmer", type = int, default = 0,
						help = "k-mer size (set to 0 to use all of each sequence) [default: %(default)d]")
parser.add_argument(	"--normalize", action = "store_true",
						help = "normalize counts against total size of msBWTs [default: %(default)d]")
args = parser.parse_args()

## open the msBWTs and the query sequences
bwts = util.BwtSet(args.msbwt)
fa = SeqIO.parse(args.fasta, format = "fasta")

## describe the run on stderr so that stdout carries only the results
sys.stderr.write( "Using the following msBWTs:\n{}".format(str(bwts)) )
## NOTE(review): this message is written whether or not --normalize was given -- confirm that
## un-normalized counts are really parts per billion
sys.stderr.write( "Reporting counts as parts per billion.\n" )
if args.kmer > 0:
	sys.stderr.write( "Breaking sequences into k-mers of length {} for searches.\n".format(args.kmer) )
else:
	sys.stderr.write( "Searching with provided sequences as-is.\n" )

## Query every record against the msBWTs; one space-separated output line per
## (record, subsequence, msBWT): record name, subsequence, msBWT name, count.
for seq in fa:

	## with --kmer 0 the whole record is searched as a single "k-mer"
	k = args.kmer if args.kmer > 0 else len(seq.seq)
	for subseq in kmers.kmerize(seq.seq, k):
		## assumes bwts.count() returns a dict-like mapping of msBWT name -> count;
		## .items() iterates it identically on Python 2 and 3
		for bwtname, count in bwts.count(subseq, args.normalize).items():
			## same text the Python 2 statement `print a, b, c, d` would emit
			sys.stdout.write(" ".join(str(field) for field in (seq.name, subseq, bwtname, count)) + "\n")
	## NOTE(review): this fragment is the interior of a loop whose header is not visible here;
	## `bwtnames`, `msbwts`, `seed` and `args` are defined above the visible range.
	## `all_found` stays True only if a usable seed is found in every BWT below
	all_found = True
	these_contigs = collections.defaultdict(str)
	for i in range(0, len(bwtnames)):

		sys.stderr.write("\t{} ...\n".format(bwtnames[i]))

		## first check that seed will work in this BWT
		## allow step-down of seed size to get started, but hold k-mer size for assembly constant
		seed_found = False
		## start one above args.kmer because the loop decrements before its first attempt
		k_start = args.kmer + 1
		while not seed_found:
			k_start -= 1
			## if seed is less than 21nt long, it's probably hopeless
			if k_start < 21:
				break
			## slide a window of size k_start over the seed (step 1) and take the first
			## window whose read count reaches the minimum weight
			for seed_k in kmers.kmerize(seed, k_start, 1):
				x = util.count_reads(msbwts[i], seed_k)
				if x >= args.minweight:
					seed_found = True
					## NB: `seed` is overwritten, so later BWTs are searched with this accepted
					## (possibly shortened) seed rather than the original one
					seed = seed_k
					sys.stderr.write("\t\tseed accepted ({} bp): {}\n".format(len(seed_k), seed_k))
					break

		## seed not found in this sample: break loop
		if not seed_found:
			all_found = False
			break

		## do assembly
		seq = assembly.greedy_assemble( args.msbwt[i], seed = seed, k = args.kmer, count_k = args.count_kmer, direction = True,
					min_weight = args.minweight, max_weight = args.maxweight,
	break_kmers.append( (kmer_stash[i_start].name, kmer_stash[i_end].name) )
	break_counts.append( (kmer_stash[i_start].score, kmer_stash[i_end].score) )

## Re-assemble every breakpoint and report its well-supported haplotypes on stdout.
## For breakpoint i, break_coords[i] supplies the three values echoed in the output,
## break_kmers[i] the two flanking k-mers handed to assemble_inward(), and break_counts[i]
## the two values shown alongside the coordinates in verbose mode.
for i, coords in enumerate(break_coords):
	hap = assemble_inward(msbwt[0], break_kmers[i][0], break_kmers[i][1])
	if args.verbose:
		outline = [ coords[0], coords[1], break_counts[i][0], coords[2], break_counts[i][1] ]
		sys.stderr.write("\t".join(str(x) for x in outline) + "\n")
		sys.stderr.write("\t" + str(hap) + "\n\n")
	for h in hap:
		## ungapped sequence -> highest read count among its sufficiently complex k-mers
		hap_avg = {}
		## assumes each element of `hap` is a dict-like mapping of sequence -> count
		for (seq, count) in h.items():
			if count > args.maf:
				## ungap once and reuse the result for both the k-mers and the dictionary key
				ungapped_seq = dna.ungap(seq)
				k_counts = [ util.count_reads(msbwt[0], k)
								for k in kmers.kmerize(ungapped_seq, kmer_size)
								if dna.complexity(k) > args.complexity ]
				## a sequence with no k-mer above the complexity threshold is not reported
				if k_counts:
					hap_avg[ungapped_seq] = max(k_counts)

		## "*" marks a site with at most one retained haplotype, "+" a site with several
		flag = "*"
		if len(hap_avg) > 1:
			sys.stderr.write("Warning: apparently there is >1 haplotype at this variant site.\n")
			flag = "+"
		for (hh, hc) in hap_avg.items():
			## NOTE(review): despite its name, args.maxhits acts as a lower bound here -- confirm
			if hc > args.maxhits:
				## same text the Python 2 statement `print a, b, ...` would emit
				sys.stdout.write(" ".join(str(field) for field in (coords[0], coords[1], coords[2], hh, hc, flag)) + "\n")
				sys.stdout.flush() # force write; this lets me peek at output in almost-real time

if args.verbose:
Exemple #6
0
## Re-assemble every breakpoint and report its well-supported haplotypes on stdout.
## For breakpoint i, break_coords[i] supplies the three values echoed in the output,
## break_kmers[i] the two flanking k-mers handed to assemble_inward(), and break_counts[i]
## the two values shown alongside the coordinates in verbose mode.
for i, coords in enumerate(break_coords):
    hap = assemble_inward(msbwt[0], break_kmers[i][0], break_kmers[i][1])
    if args.verbose:
        outline = [
            coords[0], coords[1], break_counts[i][0],
            coords[2], break_counts[i][1]
        ]
        sys.stderr.write("\t".join(str(x) for x in outline) + "\n")
        sys.stderr.write("\t" + str(hap) + "\n\n")
    for h in hap:
        ## ungapped sequence -> highest read count among its sufficiently complex k-mers
        hap_avg = {}
        ## assumes each element of `hap` is a dict-like mapping of sequence -> count
        for (seq, count) in h.items():
            if count > args.maf:
                ## ungap once and reuse the result for both the k-mers and the dictionary key
                ungapped_seq = dna.ungap(seq)
                k_counts = [
                    util.count_reads(msbwt[0], k)
                    for k in kmers.kmerize(ungapped_seq, kmer_size)
                    if dna.complexity(k) > args.complexity
                ]
                ## a sequence with no k-mer above the complexity threshold is not reported
                if k_counts:
                    hap_avg[ungapped_seq] = max(k_counts)

        ## "*" marks a site with at most one retained haplotype, "+" a site with several
        flag = "*"
        if len(hap_avg) > 1:
            sys.stderr.write(
                "Warning: apparently there is >1 haplotype at this variant site.\n"
            )
            flag = "+"
        for (hh, hc) in hap_avg.items():
            ## NOTE(review): despite its name, args.maxhits acts as a lower bound here -- confirm
            if hc > args.maxhits:
                ## same text the Python 2 statement `print a, b, ...` would emit
                sys.stdout.write(" ".join(
                    str(field)
                    for field in (coords[0], coords[1], coords[2], hh, hc, flag)) + "\n")
Exemple #7
0
    save_graph = True

## name of the msBWT = last path component, ignoring any trailing slashes
bwtname = args.msbwt.rstrip("/").split("/")[-1]

## make sure seed sequences have proper form
seed = dna.ungap(args.seed.upper())
sys.stderr.write("Seed sequence is: {}\n".format(seed))
## NOTE(review): unlike the main seed, end seeds are ungapped but not upper-cased -- confirm intended
end_seeds = []
if args.end_seeds:
    end_seeds = [dna.ungap(s) for s in args.end_seeds]

## examine the seed, taking the first k-mer which returns a result
msbwts = util.load_bwts([args.msbwt])
seed_found = False
for seed_k in kmers.kmerize(seed, args.kmer, 1):
    ## accept the first k-mer whose read count reaches the minimum weight
    if util.count_reads(msbwts[0], seed_k) >= args.minweight:
        seed_found = True
        seed = seed_k
        ## same text the Python 2 statement `print seed_k` would emit
        sys.stdout.write(str(seed_k) + "\n")
        break

## without a usable seed there is nothing to assemble: exit with an error message
if not seed_found:
    sys.exit(
        "Seed sequence doesn't contain any k-mers which meet the specified abundance threshold."
    )

## do assembly
seq = assembly.greedy_assemble(args.msbwt,
                               seed=seed,
	save_graph = True

## name of the msBWT = last path component, ignoring any trailing slashes
bwtname = args.msbwt.rstrip("/").split("/")[-1]

## make sure seed sequences have proper form
seed = dna.ungap(args.seed.upper())
sys.stderr.write("Seed sequence is: {}\n".format(seed))
## NOTE(review): unlike the main seed, end seeds are ungapped but not upper-cased -- confirm intended
end_seeds = []
if args.end_seeds:
	end_seeds = [ dna.ungap(s) for s in args.end_seeds ]

## examine the seed, taking the first k-mer which returns a result
msbwts = util.load_bwts([args.msbwt])
seed_found = False
for seed_k in kmers.kmerize(seed, args.kmer, 1):
	## accept the first k-mer whose read count reaches the minimum weight
	if util.count_reads(msbwts[0], seed_k) >= args.minweight:
		seed_found = True
		seed = seed_k
		## same text the Python 2 statement `print seed_k` would emit
		sys.stdout.write(str(seed_k) + "\n")
		break

## without a usable seed there is nothing to assemble: exit with an error message
if not seed_found:
	sys.exit("Seed sequence doesn't contain any k-mers which meet the specified abundance threshold.")

## do assembly
## NB: greedy_assemble() receives the msBWT path (args.msbwt), not the objects loaded into `msbwts`
seq = assembly.greedy_assemble( args.msbwt, seed = seed, end_seeds = end_seeds,
				k = args.kmer, count_k = args.count_kmer, direction = args.backward,
				min_weight = args.minweight, max_weight = args.maxweight,
				max_nodes = args.maxnodes, max_length = args.maxlength,
				save = save_graph, outprefix = args.prefix, memmap = args.memmap )
            # NOTE(review): this fragment begins mid-function; `ref`, `kmers`, `s`, `e`, `ksize`,
            # `kstep`, `bwts` and `logger` are all defined above the visible range.
            # Try, in each BWT, to assemble a bridge from the k-mer just before the range (s-1)
            # to the k-mer at its end (e); neither `n` nor `bb` is used in the visible part.
            for bb, b in enumerate(bwts):
                alt, n = assembly.build_bridge(b, str(kmers[s - 1].name),
                                               str(kmers[e].name))
                # check for successful assembly
                if not len(alt):
                    # unsuccessful assembly
                    logger.info("Couldn't bridge range {}:{}-{}".format(
                        kmers[s - 1].chrom, kmers[s - 1].start + 1,
                        kmers[e].end))
                else:
                    # assembly successful; align to reference
                    # only the first element of `alt` is used from here on
                    ref_aln, alt_aln, ops = swalign.align(ref, alt[0])

                    # check coverage of alt alleles
                    # count reads in this BWT for every k-mer of alt[0], ignoring k-mers shorter than ksize
                    alt_counts = []
                    for k in ktools.kmerize(alt[0], ksize, kstep):
                        if len(k) == ksize:
                            alt_counts.append(util.count_reads(b, k))
                    # NOTE(review): if alt[0] yields no full-length k-mer, alt_counts is empty and
                    # np.median() returns nan -- confirm downstream code tolerates that
                    support = np.median(alt_counts)
                    #print(ref_aln)
                    #print(ops)
                    #print(alt_aln)
                    # iterate on variant sites in alignment
                    for offset, site_ref, site_alt in swalign.reconcile(
                            ref_aln, alt_aln):
                        # generate VCF entry
                        # coordinates are relative to the k-mer preceding the range (s-1)
                        CHROM = kmers[s - 1].chrom
                        POS = kmers[
                            s -
                            1].start + offset + 1  #NB: must convert to 1-based
                        ID = "."