def split(args):
    parser = optparse.OptionParser("""\
split a fasta file into separated files.
    pyfasta split -n 6 [-k 5000] some.fasta
the output will be some.0.fasta, some.1.fasta ... some.5.fasta
the sizes will be as even as reasonable.
""")
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
        help="""this overrides all other options. if specified, it will split
the file into a separate file for each header. it is a template specifying
the file name for each new file, e.g.: "%(fasta)s.%(seqid)s.fasta" where
'fasta' is the basename of the input fasta file and seqid is the header of
each entry in the fasta file.""", default=None)

    parser.add_option("-n", "--n", type="int", dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap",
                      help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                      help="""\
split big files into pieces of this size in basepairs. the default of -1
means do not split the sequence up into k-mers, just split based on the
headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header %
                       dict(fasta=f.fasta_name, seqid=seqid))
                      for seqid in f.iterkeys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format",
                                                options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                         header=options.header)
        #fhs = [open(n, 'wb') for n in names]
        if options.kmers == -1:
            return without_kmers(f, names)
        else:
            return with_kmers(f, names, options.kmers, options.overlap)
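# A minimal illustration (not part of pyfasta) of how the --header template is
# expanded: it is plain "%" dict formatting with the keys 'fasta' and 'seqid'.
# The filenames below are made up for the example.
tmpl = "%(fasta)s.%(seqid)s.fasta"
for seqid in ("chr1", "chr2"):
    fn = tmpl % dict(fasta="some.fasta", seqid=seqid)
    assert fn == "some.fasta.%s.fasta" % seqid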
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta`
    which contains:
      + the same headers prefixed with 'f' with all C's converted to T
      + headers prefixed with 'r' reverse complemented with all C's converted to T.

    if unconverted is true, then also save a file with the forward and reverse
    without conversion.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace: d += "_colorspace"
    if not op.exists(d): os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted:
            return fname, unconverted_fname
        elif op.exists(unconverted_fname):
            return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None
    print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)

    try:
        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert ">" not in seq
            # c2t, prefix header with f and write
            print >>c2t_fh, ">f%s" % header
            print >>c2t_fh, seq.replace('C', 'T')
            # then r-c, c2t, prefix header with r and write
            print >>c2t_fh, ">r%s" % header
            rseq = revcomp(seq)
            print >>c2t_fh, rseq.replace('C', 'T')
            if unc_fh is not None:
                print >>unc_fh, ">f%s\n%s" % (header, seq)
                print >>unc_fh, ">r%s\n%s" % (header, rseq)
        c2t_fh.close()
    except:
        os.unlink(fname)
        # only remove the unconverted file if it was actually opened.
        if unc_fh is not None:
            os.unlink(unconverted_fname)
        raise
    return fname, unconverted_fname
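# A minimal sketch (independent of the function above) of the per-record
# conversion it writes: an 'f'-prefixed C->T copy, and an 'r'-prefixed
# reverse-complemented C->T copy. _revcomp here is a stand-in for the
# project's revcomp() helper, whose behavior is assumed.
def _revcomp(seq, _comp={'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}):
    return "".join(_comp[b] for b in reversed(seq))

demo_seq = "ACCGTN"
assert demo_seq.replace('C', 'T') == "ATTGTN"            # forward, C -> T
assert _revcomp(demo_seq).replace('C', 'T') == "NATGGT"  # revcomp, then C -> T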
def mask(fasta_file, org, cutoff, mask_value='X'):
    h5, node = get_node(org, 'r')
    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                        + fasta_file[fasta_file.rfind("."):]
    print "> masking sequence to file:", outfile
    out = open(outfile, 'w')

    fasta = Fasta(fasta_file)
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            seq = fasta[seqid][:]

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid, \
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        hit_counts = getattr(node, 'c' + seqid)[:]
        mask_bool = numexpr.evaluate("hit_counts > %i" % cutoff)
        masked_seq = np.where(mask_bool, mask_value, seq).tostring()
        l = len(masked_seq)
        # count masked positions from the boolean mask; this also works for
        # soft masking, where mask_value is an array rather than a single char.
        masked = int(mask_bool.sum())
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
                                                        100.0 * masked / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')
    out.close()

    # write out a file .fasta.version containing the svnversion (if available)
    # of this script that was used to create the file.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
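# A minimal sketch (made-up data) of the hard-masking step above: any base
# whose k-mer hit count exceeds the cutoff is replaced with the mask value.
import numpy as np
toy_seq = np.array("ACGTACGT", dtype='c')
toy_hits = np.array([1, 9, 9, 1, 1, 1, 9, 1])
toy_cutoff = 5
assert np.where(toy_hits > toy_cutoff, 'X', toy_seq).tostring() == "AXXTACXT"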
    chh_idxs, = np.where(~(first_g | second_g))
    chh = c_idxs[chh_idxs]
    methyl[chh] = CHH + adder
    return methyl

def write_file(seq, file_path):
    methyl = calc_methylation(str(seq))
    methyl.tofile(file_path)

if __name__ == "__main__":
    import optparse
    import sys
    import doctest
    import os
    doctest.testmod()

    p = optparse.OptionParser(__doc__)
    opts, args = p.parse_args()
    if not (len(args) and args[0]):
        sys.exit(p.print_help())

    from pyfasta import Fasta
    path = args[0]
    f = Fasta(path)
    for k in f.iterkeys():
        print "calculating methylation data for seqid: %s" % k
        binpath = "%s.%s.methyl.type.bin" % (path, k)
        write_file(f[k], binpath)
        print "saved to %s" % binpath
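# A minimal, self-contained sketch of the tofile/fromfile round trip used by
# write_file() above, with made-up data. The uint8 dtype is an assumption
# about what calc_methylation() returns (it is not shown here); match the
# real dtype when reading files produced by this script.
import numpy as np
demo = np.array([0, 1, 4, 2, 0], dtype=np.uint8)
demo.tofile("demo.methyl.type.bin")
assert (np.fromfile("demo.methyl.type.bin", dtype=np.uint8) == demo).all()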
subject_fasta_file = sys.argv[2]
out_fasta = "%s.features%s" % op.splitext(subject_fasta_file)

by_subject = collections.defaultdict(list)
fa = Fasta(subject_fasta_file)

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return izip(a, b)

by_subject = {}
seqids = []
for seqid in fa.iterkeys():
    by_subject[seqid] = np.zeros((len(fa[seqid]) + 1,), dtype=np.uint8)
    seqids.append((len(fa[seqid]), seqid))

by_query_subject = collections.defaultdict(dict)

for line in open(blast_file):
    args = line.split("\t")
    query, subject = args[0], args[1]
    sstart, sstop = sorted(map(int, args[8:10]))
    by_subject[subject][sstart: sstop + 1] |= 1
    if not subject in by_query_subject[query]:
        by_query_subject[query][subject] = [(sstart, sstop)]
    else:
        by_query_subject[query][subject].append((sstart, sstop))
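# A minimal sketch (toy coordinates, not from the script above) of the
# per-base bitmask built in the loop: the array is allocated with length + 1
# so BLAST's 1-based subject coordinates can index it directly, and every hit
# ORs 1 into the positions it covers, so covered bases can be counted later.
import numpy as np
coverage = np.zeros((11,), dtype=np.uint8)    # hypothetical subject of length 10
for sstart, sstop in [(2, 4), (3, 6)]:        # two overlapping hits
    coverage[sstart: sstop + 1] |= 1
assert coverage.sum() == 5                    # bases 2..6 are covered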
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end):

    fa = Fasta(ref_path)
    fc, ft, fmethyltype = \
            bin_paths_from_fasta(fa.fasta_name, out_dir)

    counts = get_counts(fc, ft, fa)
    chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())

    print >>sys.stderr, "tabulating methylation"
    gsnap_subset = open(gsnap_f.replace(".gsnap.sam", ".sam"), "w")

    for sline in open(gsnap_f):
        if sline.startswith("@SQ"):
            print >>gsnap_subset, sline.strip()
            continue
        # the ends didn't map to same spot.
        line = sline.split("\t")
        sam_flag = int(line[1])
        if paired_end:
            if line[6] != "=": continue
            #print >>gsnap_subset, sline.strip()
        else:
            # no reported alignments.
            if sam_flag == 4: continue
        print >>gsnap_subset, sline.rstrip("\n")

        seqid = line[2]
        aln_seq = line[9]
        read_length = len(aln_seq)
        bp0 = int(line[3]) - 1
        ga = ((sam_flag & 16) != 0) ^ (sam_flag & 128 != 0)
        insert_length = int(line[8])
        #line[9] = aln_seq
        #line[10] = line[10][:len(aln_seq)]
        # both ends start at exactly the same place.
        if insert_length == 0: continue
        # handle overlapping reads. one side has + insert, the other is -
        if -read_length < insert_length < 0:
            insert_length = abs(insert_length)
            aln_seq = aln_seq[:-(read_length - insert_length)]
            read_length = len(aln_seq)
        if line[7] == '0': continue
        bp1 = bp0 + read_length
        ref_seq = (fa[seqid][bp0:bp1]).upper()

        letters = 'GA' if ga else 'CT'
        read_length = len(ref_seq)
        assert read_length > 0, (bp0, bp1)

        _update_conversions(ref_seq, aln_seq, bp0, letters,
                            counts[seqid]['c'], counts[seqid]['t'],
                            50, read_length, line[5])
    write_files(fa.fasta_name, out_dir, counts)

    cmd = open(out_dir + "/cmd.ran", "w")
    import datetime
    print >>cmd, "#date:", str(datetime.date.today())
    print >>cmd, "#path:", op.abspath(".")
    print >>cmd, " ".join(sys.argv)
    write_sam_commands(out_dir, fa, "methylcoder.gsnap")
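# A minimal sketch of the strand logic above: SAM flag bit 16 (0x10) means the
# read aligned to the reverse strand and bit 128 (0x80) marks the second read
# of a pair; XOR-ing the two decides whether to tally G->A or C->T conversions
# (letters = 'GA' if ga else 'CT' in the function above).
def is_ga(sam_flag):
    return ((sam_flag & 16) != 0) ^ ((sam_flag & 128) != 0)

assert not is_ga(0)         # forward strand, first/only read -> CT
assert is_ga(16)            # reverse strand, first/only read -> GA
assert not is_ga(16 | 128)  # reverse strand, second of pair  -> CT
assert is_ga(128)           # forward strand, second of pair  -> GA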
def count_conversions(original_fasta, sam_iter, raw_reads_list, c2t_reads_list,
                      index_class, out_dir, allowed_mismatches, mode='w',
                      counts=None, is_colorspace=False):
    print >>sys.stderr, "tabulating methylation for %s" % (original_fasta,)

    fh_raw_reads_list = [open(raw_reads) for raw_reads in raw_reads_list]
    fh_c2t_reads_list = [open(c2t_reads) for c2t_reads in c2t_reads_list]
    fqidx_list = [index_class(c2t_reads) for c2t_reads in c2t_reads_list]

    fa = Fasta(original_fasta)
    #fh_raw_reads = open(raw_reads, 'r')
    #fh_c2t_reads = open(c2t_reads, 'r')

    def get_records(header, idx):
        return get_raw_and_c2t(header, fqidx_list[idx], fh_raw_reads_list[idx],
                               fh_c2t_reads_list[idx])

    chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())

    out_sam = open(out_dir + "/methylcoded.sam", mode)

    # get the 3 possible binary files for each chr
    fc, ft, fmethyltype = \
            bin_paths_from_fasta(original_fasta, out_dir)

    # if we're re-running without conversion, it was passed via kwargs.
    # otherwise, we get a new set.
    retry = counts is not None
    unmapped_name = op.join(out_dir, "unmapped.reads")
    if retry:
        unmapped_name += ".try2"
    else:
        counts = get_counts(fc, ft, fa)

    skipped = 0
    align_count = 0
    for p, sam_line, read_len, direction in parse_sam(sam_iter, chr_lengths,
                                                      get_records,
                                                      unmapped_name,
                                                      is_colorspace, out_sam):
        pairs = "CT" if direction == "f" else "GA"
        read_id = p['read_id']
        pos0 = p['pos0']
        align_count += 1
        raw = p['raw_read']
        sam_flag = int(sam_line[1])

        # paired and first of read and ends have overlap.
        # this section makes sure that the overlap is only counted 1x.
        if sam_line[7] != '0' and 0 <= int(sam_line[8]) < read_len:
            offset = int(sam_line[8])
            pos0 = pos0 + read_len - offset
            read_len = offset
            raw = raw[-offset:]
            # chop the read info to the non-overlapping bases.
            p['read_sequence'] = p['read_sequence'][-offset:]

        # the position is the line num * the read_id + read_id (where the +
        # is to account for the newlines.
        genomic_ref = str(fa[p['seqid']][pos0:pos0 + read_len]).upper()
        DEBUG = False
        if DEBUG:
            print >>sys.stderr, "=" * 80
            print >>sys.stderr, sam_line
            print >>sys.stderr, "=" * 80
            print >>sys.stderr, ">>", sam_line[1], sam_flag
            idx = 0 if (sam_flag & 128) == 0 else 1
            araw, ac2t = get_records(read_id, idx)
            #araw.seq = araw.seq[-offset:]
            #c2t.seq = ac2t.seq[-offset:]
            print >>sys.stderr, "f['%s'][%i:%i + %i]" % (p['seqid'], pos0,
                                                         pos0, read_len)
            print >>sys.stderr, "mismatches:", p['nmiss']
            print >>sys.stderr, "ref :", genomic_ref
            if direction == 'r':
                print >>sys.stderr, "raw_read(r):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) == 0:
                        c2t = revcomp(c2t)[-offset:]
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'])
            else:
                print >>sys.stderr, "raw_read(f):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) != 0:
                        c2t = revcomp(c2t)
                        p['read_sequence'] = revcomp(p['read_sequence'])
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'],
                                                       sam_flag)
            print >>sys.stderr, "c2t :", c2t, "\n"

        # have to keep the ref in forward here to get the correct bp
        # positions. look for CT when forward and GA when back.
        current_mismatches = p['nmiss']
        # we send in the current mismatches and allowed mismatches so that in
        # the case where a C in the ref seq has become a T in the align seq
        # (but wasn't calc'ed as a mismatch because we had converted C=>T), if
        # these errors cause the number of mismatches to exceed the number of
        # allowed mismatches, we don't include the stats for this read.
        # if we're retrying, the reads are treated, but the genome is not
        # c2t-converted, so don't check the mismatches since it was done
        # correctly by bowtie.
        remaining_mismatches = -1 if retry else (allowed_mismatches -
                                                 current_mismatches)
        this_skipped = _update_conversions(genomic_ref, raw, pos0, pairs,
                                           counts[p['seqid']]['c'],
                                           counts[p['seqid']]['t'],
                                           remaining_mismatches, read_len,
                                           sam_line[5])
        if this_skipped == 0:
            # only print the line to the sam file if we use it in our calcs.
            print >>out_sam, "\t".join(sam_line)
        skipped += this_skipped
        if DEBUG:
            raw_input("press any key...\n")

    print >>sys.stderr, "total alignments: %i" % align_count
    print >>sys.stderr, \
            "skipped %i alignments (%.3f%%) where read T mapped to genomic C" % \
            (skipped, 100.0 * skipped / (align_count or 1))

    return counts, unmapped_name
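# A minimal sketch (toy numbers, not from the function above) of the
# paired-end overlap trimming: when the template length (sam_line[8]) is
# non-negative and smaller than the read length, only the last `offset` bases
# of the read are tallied so the overlapping region is counted once, and pos0
# is advanced to where that suffix starts.
read_len, pos0, offset = 76, 1000, 30           # hypothetical alignment
trimmed_pos0 = pos0 + read_len - offset         # 1046: start of the kept suffix
trimmed_len = offset                            # 30 bases are kept
raw = "A" * read_len
assert len(raw[-offset:]) == trimmed_len
assert trimmed_pos0 + trimmed_len == pos0 + read_len  # suffix ends where the read ends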