def test_extract_paired_reads_1_fa(): # test input file infile = utils.get_test_data("paired-mixed.fa") ex_outfile1 = utils.get_test_data("paired-mixed.fa.pe") ex_outfile2 = utils.get_test_data("paired-mixed.fa.se") # actual output files... outfile1 = utils.get_temp_filename("paired-mixed.fa.pe") in_dir = os.path.dirname(outfile1) outfile2 = utils.get_temp_filename("paired-mixed.fa.se", in_dir) script = scriptpath("extract-paired-reads.py") args = [infile] runscript(script, args, in_dir) assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0
def test_filter_abund_single_8_retain_Ns():
    # check that filter-abund-single retains
    # sequences with Ns, and treats them as As.

    infile = utils.get_temp_filename('test.fq')
    in_dir = os.path.dirname(infile)

    # copy test file over to test.fq & load into countgraph
    shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)

    script = 'filter-abund-single.py'
    args = ['-k', '17', '-x', '1e7', '-N', '2', '-C', '3', infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    # test for a sequence with an 'N' in it --
    names = set([r.name for r in screed.open(outfile)])
    assert '895:1:37:17593:9954 1::FOO_withN' in names, names

    # check that the 'N' was *not* rewritten to an 'A' in the output --
    # Ns are only treated as As for counting purposes
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAG' not in seqs, seqs

    # ...and that an 'N' remains in the output sequences
    found_N = False
    for s in seqs:
        if 'N' in s:
            found_N = True
    assert found_N, seqs
def test_extract_paired_reads_2_fq(): # test input file infile = utils.get_test_data('paired-mixed.fq') ex_outfile1 = utils.get_test_data('paired-mixed.fq.pe') ex_outfile2 = utils.get_test_data('paired-mixed.fq.se') # actual output files... outfile1 = utils.get_temp_filename('paired-mixed.fq.pe') in_dir = os.path.dirname(outfile1) outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir) script = scriptpath('extract-paired-reads.py') args = [infile] runscript(script, args, in_dir) assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert r.accuracy == q.accuracy assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert r.accuracy == q.accuracy assert n > 0
def test_split_paired_reads_1_fa(): # test input file infile = utils.get_test_data('paired.fa') ex_outfile1 = utils.get_test_data('paired.fa.1') ex_outfile2 = utils.get_test_data('paired.fa.2') # actual output files... outfile1 = utils.get_temp_filename('paired.fa.1') in_dir = os.path.dirname(outfile1) outfile2 = utils.get_temp_filename('paired.fa.2', in_dir) script = 'split-paired-reads.py' args = [infile] utils.runscript(script, args, in_dir) assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0
def test_extract_partitions_fq(): seqfile = utils.get_test_data('random-20-a.fq') graphbase = _make_graph( seqfile, do_partition=True, annotate_partitions=True) in_dir = os.path.dirname(graphbase) # get the final part file partfile = os.path.join(in_dir, 'random-20-a.fq.part') # ok, now run extract-partitions. script = scriptpath('extract-partitions.py') args = ['extracted', partfile] runscript(script, args, in_dir) distfile = os.path.join(in_dir, 'extracted.dist') groupfile = os.path.join(in_dir, 'extracted.group0000.fq') assert os.path.exists(distfile) assert os.path.exists(groupfile) dist = open(distfile).readline() assert dist.strip() == '99 1 1 99' parts = [r.name.split('\t')[1] for r in screed.open(partfile)] assert len(parts) == 99, len(parts) parts = set(parts) assert len(parts) == 1, len(parts) quals = set([r.accuracy for r in screed.open(partfile)]) quals = list(quals) assert quals[0], quals
def test_split_paired_reads_2_mixed_fq_orphans_to_file():
    # test input file
    infile = utils.copy_test_data('paired-mixed-2.fq')
    in_dir = os.path.dirname(infile)
    outfile = utils.get_temp_filename('out.fq')

    script = 'split-paired-reads.py'
    args = ['-0', outfile, infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0

    # 3 left + 3 right + 5 orphans = 11 sequences total
    assert "split 11 sequences (3 left, 3 right, 5 orphans)" in err, err

    n_orphans = len([1 for record in screed.open(outfile)])
    assert n_orphans == 5
    n_left = len([1 for record in screed.open(infile + '.1')])
    assert n_left == 3
    n_right = len([1 for record in screed.open(infile + '.2')])
    assert n_right == 3

    # none of the output files should be gzipped
    for filename in [outfile, infile + '.1', infile + '.2']:
        fp = gzip.open(filename)
        try:
            fp.read()
        except IOError as e:
            assert "Not a gzipped file" in str(e), str(e)
        fp.close()
def test_split_paired_reads_3_output_files_right(): # test input file infile = utils.get_test_data('paired.fq') ex_outfile1 = utils.get_test_data('paired.fq.1') ex_outfile2 = utils.get_test_data('paired.fq.2') # actual output files... outfile1 = utils.get_temp_filename('paired.fq.1') output_dir = os.path.dirname(outfile1) outfile2 = utils.get_temp_filename('yyy', output_dir) script = 'split-paired-reads.py' args = ['-2', outfile2, '-d', output_dir, infile] utils.runscript(script, args) assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert r.quality == q.quality assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert r.quality == q.quality assert n > 0
def test_interleave_read_stdout(): # create input files infile1 = utils.get_test_data('paired-slash1.fq.1') infile2 = utils.get_test_data('paired-slash1.fq.2') # correct output ex_outfile = utils.get_test_data('paired-slash1.fq') # actual output file outfile = utils.get_temp_filename('out.fq') script = 'interleave-reads.py' args = [infile1, infile2] (stats, out, err) = utils.runscript(script, args) with open(outfile, 'w') as ofile: ofile.write(out) n = 0 for r, q in zip(screed.open(ex_outfile), screed.open(outfile)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0
def test_normalize_by_median_paired_fq(): CUTOFF = '20' infile = utils.get_temp_filename('test.fa') in_dir = os.path.dirname(infile) shutil.copyfile(utils.get_test_data('test-abund-read-paired.fq'), infile) script = 'normalize-by-median.py' args = ['-C', CUTOFF, '-p', '-k', '17', infile] _, out, err = utils.runscript(script, args, in_dir) print(out) print(err) outfile = infile + '.keep' assert os.path.exists(outfile), outfile seqs = [r.sequence for r in screed.open(outfile)] assert len(seqs) == 6, len(seqs) assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs names = [r.name for r in screed.open(outfile, parse_description=False)] assert len(names) == 6, names assert '895:1:37:17593:9954 1::FOO' in names, names assert '895:1:37:17593:9954 2::FOO' in names, names
def main():
    dbfile = sys.argv[1]
    mapfile = sys.argv[2]

    lengths = {}
    for n, record in enumerate(screed.open(dbfile)):
        if n % 100000 == 0:
            print('...', n)
        lengths[record.name] = len(record.sequence)

    sums = {}
    for n, line in enumerate(open(mapfile)):
        if n % 100000 == 0:
            print('... 2x', n)
        x = line.split('\t')
        name = x[2]
        readlen = len(x[4])
        sums[name] = sums.get(name, 0) + 1

    # total mapped reads; n is a 0-based index, so add 1
    mapped_reads = n + 1

    # RPKM = reads per kilobase of reference, per million mapped reads
    rpkms = {}
    for k in sums:
        rpkms[k] = sums[k] * (1000. / float(lengths[k])) / \
            (float(mapped_reads) / 1e6)

    outfp = open(dbfile + '.cov', 'w')
    for n, record in enumerate(screed.open(dbfile)):
        if n % 100000 == 0:
            print('...', n)
        print(">%s[cov=%d]\n%s" % (record.name,
                                   rpkms.get(record.name, 0),
                                   record.sequence), file=outfp)
def test_extract_paired_reads_3_output_dir(): # test input file infile = utils.get_test_data('paired-mixed.fa') ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe') ex_outfile2 = utils.get_test_data('paired-mixed.fa.se') # output directory out_dir = utils.get_temp_filename('output') script = 'extract-paired-reads.py' args = [infile, '-d', out_dir] utils.runscript(script, args) outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe') outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se') assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0
def fixseqs(filein, fileout):
    fw = open(fileout, 'w')
    fi = open(filein, 'r')
    line1 = fi.readline()
    fi.close()

    # Does line 1 correspond to FASTA?
    if line1[0] == '>':
        for n, record in enumerate(screed.open(filein)):
            name = record['name']
            sequence = record['sequence']
            fw.write('>%s\n%s\n' % (name, sequence))
            # print name, "\n", sequence

    # Does line 1 correspond to FASTQ?
    elif line1[0] == '@':
        for n, record in enumerate(screed.open(filein)):
            if 'N' in record['annotations']:
                name = record['name'] + ' ' + record['annotations']
                sequence = record['sequence']
                accuracy = record['accuracy']
                fw.write('@%s\n%s\n+\n%s\n' % (name, sequence, accuracy))

    # Neither FASTA nor FASTQ provided
    else:
        print 'Neither fasta nor fastq input. Do your headers start with ' \
              '> (fasta) or @ (fastq)?'

    fw.close()
def test_normalize_by_median_paired_fq(): CUTOFF = "20" infile = utils.get_temp_filename("test.fa") in_dir = os.path.dirname(infile) shutil.copyfile(utils.get_test_data("test-abund-read-paired.fq"), infile) script = "normalize-by-median.py" args = ["-C", CUTOFF, "-p", "-k", "17", infile] _, out, err = utils.runscript(script, args, in_dir) print(out) print(err) outfile = infile + ".keep" assert os.path.exists(outfile), outfile seqs = [r.sequence for r in screed.open(outfile)] assert len(seqs) == 6, len(seqs) assert seqs[0].startswith("GGTTGACGGGGCTCAGGGGG"), seqs assert seqs[1].startswith("GGTTGACGGGGCTCAGGG"), seqs names = [r.name for r in screed.open(outfile)] assert len(names) == 6, names assert "895:1:37:17593:9954 1::FOO" in names, names assert "895:1:37:17593:9954 2::FOO" in names, names
def test_split_paired_reads_2_mixed_fq_gzfile():
    # test input file
    infile = utils.get_temp_filename('test.fq')
    shutil.copyfile(utils.get_test_data('paired-mixed-2.fq'), infile)
    in_dir = os.path.dirname(infile)
    outfile = utils.get_temp_filename('out.fq')

    script = 'split-paired-reads.py'
    args = ['-0', outfile, '--gzip', infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0

    # 3 left + 3 right + 5 orphans = 11 sequences total
    assert "split 11 sequences (3 left, 3 right, 5 orphans)" in err, err

    n_orphans = len([1 for record in screed.open(outfile)])
    assert n_orphans == 5
    n_left = len([1 for record in screed.open(infile + '.1')])
    assert n_left == 3
    n_right = len([1 for record in screed.open(infile + '.2')])
    assert n_right == 3

    for filename in [outfile, infile + '.1', infile + '.2']:
        fp = gzip.open(filename)
        fp.read()  # this will fail if not a gzip file.
        fp.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('genomes') parser.add_argument('reads') args = parser.parse_args() # build a counting label hash + readaligner. lh = khmer.CountingLabelHash(21, 1e7, 4) lh.consume_fasta_and_tag_with_labels(args.genomes) aligner = khmer.ReadAligner(lh.graph, 1, 1.0) names = [] # (labels in 'lh' are in the order of the sequences in the file) for grec in screed.open(args.genomes): names.append(grec.name) print 'loaded two references:', names # run through all the reads, align, and use alignments to look up # the label. for record in screed.open(args.reads): # build alignments against cg _, ga, ra, truncated = aligner.align(record.sequence) if truncated: print 'NO MATCHES', record.name else: # now grab the associated labels labels = lh.sweep_label_neighborhood(ga) # print out the matches. matches = set([ names[i] for i in labels ]) print record.name, 'matches to', ", ".join(matches)
def main(): if len(sys.argv) < 2: sys.stderr.write('*** Usage: python {} <seqfile>\n'.format( os.path.basename(sys.argv[0]))) sys.exit(1) seqfile = sys.argv[1] d = OrderedDict() for rec in screed.open(seqfile): name = rec.name.split(None, 1)[0] seq = rec.sequence if name.endswith('/1') or name.endswith('/2'): name2 = name[:-2] if name2 in d: if d[name2][0] > len(seq): continue d[name2] = len(seq), name else: d[name] = len(seq), name st = set([d[name][-1] for name in d]) for rec in screed.open(seqfile): name = rec.name.split(None, 1)[0] if name in st: sys.stdout.write('>{}\n{}\n'.format(name, rec.sequence))
def test_filter_abund_1(): script = 'filter-abund.py' infile = utils.copy_test_data('test-abund-read-2.fa') n_infile = utils.copy_test_data('test-fastq-n-reads.fq') in_dir = os.path.dirname(infile) n_in_dir = os.path.dirname(n_infile) counting_ht = _make_counting(infile, K=17) n_counting_ht = _make_counting(n_infile, K=17) args = [counting_ht, infile] utils.runscript(script, args, in_dir) outfile = infile + '.abundfilt' n_outfile = n_infile + '.abundfilt' n_outfile2 = n_infile + '2.abundfilt' assert os.path.exists(outfile), outfile seqs = set([r.sequence for r in screed.open(outfile)]) assert len(seqs) == 1, seqs assert 'GGTTGACGGGGCTCAGGG' in seqs args = [n_counting_ht, n_infile] utils.runscript(script, args, n_in_dir) seqs = set([r.sequence for r in screed.open(n_infile)]) assert os.path.exists(n_outfile), n_outfile args = [n_counting_ht, n_infile, '-o', n_outfile2] utils.runscript(script, args, in_dir) assert os.path.exists(n_outfile2), n_outfile2
def test_extract_paired_reads_4_output_files(): # test input file infile = utils.get_test_data('paired-mixed.fa') ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe') ex_outfile2 = utils.get_test_data('paired-mixed.fa.se') # actual output files... outfile1 = utils.get_temp_filename('out_pe') outfile2 = utils.get_temp_filename('out_se') script = 'extract-paired-reads.py' args = [infile, '-p', outfile1, '-s', outfile2] utils.runscript(script, args) assert os.path.exists(outfile1), outfile1 assert os.path.exists(outfile2), outfile2 n = 0 for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0 n = 0 for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)): n += 1 assert r.name == q.name assert r.sequence == q.sequence assert n > 0
def test_gz_open_fastq():
    filename1 = os.path.join(os.path.dirname(__file__), 'test.fastq')
    filename2 = os.path.join(os.path.dirname(__file__), 'test.fastq.gz')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name
    assert n > 0


def test_gz_open():
    filename1 = utils.get_test_data('test.fa')
    filename2 = utils.get_test_data('test.fa.gz')
    with screed.open(filename1) as f1, screed.open(filename2) as f2:
        for n, (r1, r2) in enumerate(zip(f1, f2)):
            assert r1.name == r2.name
    assert n > 0


def test_bz2_open():
    filename1 = utils.get_test_data('test.fa')
    filename2 = utils.get_test_data('test.fa.bz2')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name
    assert n > 0


def test_gz_open_fastq():
    filename1 = utils.get_test_data('test.fastq')
    filename2 = utils.get_test_data('test.fastq.gz')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name
    assert n > 0
def main(): parser = argparse.ArgumentParser() parser.add_argument('prefix') parser.add_argument('transcripts_file') args = parser.parse_args() prefix = args.prefix filename = args.transcripts_file # first pass: count partition sizes partition_sizes = {} for n, record in enumerate(screed.open(filename, parse_description=0)): if n % 10000 == 0: print '...', n partition = record.name.split()[-1] partition_sizes[partition] = partition_sizes.get(partition, 0) + 1 # show top 10 biggest partitions print '---------------' print 'partition, size' for n, (_, size) in enumerate(sorted(partition_sizes.items(), key=lambda x: -x[1])): print n, size if n == 10: break print '---------------' # now, make a sensible header for each sequence that uniquely ids it partition_sofar = {} seq_id = 1 new_filename = os.path.basename(filename) if new_filename.endswith('.gz'): new_filename = new_filename[:-3] if new_filename.endswith('.fasta'): new_filename = new_filename[:-6] new_filename += '.renamed.fasta.gz' print 'creating', new_filename outfp = gzip.open(new_filename, 'wb') for n, record in enumerate(screed.open(sys.argv[2], parse_description=0)): if n % 10000 == 0: print '...writing', n partition = record.name.split()[-1] sofar = partition_sofar.get(partition, 0) + 1 partition_sofar[partition] = sofar partition_size = partition_sizes[partition] new_name = '%s.id%d.tr%s %d_of_%d_in_tr%s len=%d id=%s tr=%s' % \ (prefix, seq_id, partition, sofar, partition_size, partition, len(record.sequence), seq_id, partition) outfp.write('>%s\n%s\n' % (new_name, record.sequence)) seq_id += 1 print 'total sequences:', n+1 print 'total transcript families:', len(partition_sizes)
def main(): info('interleave-reads.py') args = get_parser().parse_args() for _ in args.infiles: check_file_status(_, args.force) check_space(args.infiles, args.force) s1_file = args.infiles[0] if len(args.infiles) == 2: s2_file = args.infiles[1] else: s2_file = s1_file.replace('_R1_', '_R2_') print >> sys.stderr, ("given only one file; " "guessing that R2 file is %s" % s2_file) fail = False if not os.path.exists(s1_file): print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file fail = True if not os.path.exists(s2_file): print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file fail = True if fail and not args.force: sys.exit(1) print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file) counter = 0 for read1, read2 in itertools.izip(screed.open(s1_file), screed.open(s2_file)): if counter % 100000 == 0: print >> sys.stderr, '...', counter, 'pairs' counter += 1 name1 = read1.name if not name1.endswith('/1'): name1 += '/1' name2 = read2.name if not name2.endswith('/2'): name2 += '/2' assert name1[:-2] == name2[:-2], \ "This doesn't look like paired data! %s %s" % (name1, name2) read1.name = name1 read2.name = name2 write_record(read1, args.output) write_record(read2, args.output) print >> sys.stderr, 'final: interleaved %d pairs' % counter print >> sys.stderr, 'output written to', args.output
def main(): parser = argparse.ArgumentParser(description="Get reads coverage matrix") parser.add_argument('hashname1') parser.add_argument('hashname2') parser.add_argument('file1') parser.add_argument('file2') parser.add_argument('output') args = parser.parse_args() hashname1 = args.hashname1 hashname2 = args.hashname2 output = args.output file1 = args.file1 file2 = args.file2 outfp = open(output, 'w') print 'hashtable from', hashname1 ht1 = khmer.load_counting_hash(hashname1) ht2 = khmer.load_counting_hash(hashname2) matrix1 = {} matrix2 = {} set_x = set() set_y = set() for n, record in enumerate(screed.open(file1)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file1 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) set_x.add(med1) med2, _, _ = ht2.get_median_count(seq) set_y.add(med2) key = str(med1)+'-'+str(med2) matrix1[key] = matrix1.get(key,0) + 1 for n, record in enumerate(screed.open(file2)): if n > 0 and n % 100000 == 0:#100000 print '...', n, file2 seq = record.sequence.replace('N', 'A') med1, _, _ = ht1.get_median_count(seq) set_x.add(med1) med2, _, _ = ht2.get_median_count(seq) set_y.add(med2) key = str(med1)+'-'+str(med2) matrix2[key] = matrix2.get(key,0) + 1 for x in range(max(list(set_x))): for y in range(max(list(set_y))): to_print = str(x)+'-'+str(y)+' '+ \ str(matrix1.get(str(x)+'-'+str(y),0))+' '+ \ str(matrix2.get(str(x)+'-'+str(y),0))+' '+ \ str(matrix1.get(str(x)+'-'+str(y),0)+matrix2.get(str(x)+'-'+str(y),0))+'\n' outfp.write(to_print) outfp.close()
def read_interleaved_or_paired(fq1, fq2=None):
    if fq2:
        r1s = screed.open(fq1)
        r2s = screed.open(fq2)
        for r1, r2 in zip(r1s, r2s):
            yield (r1, r2)
    else:
        reads = screed.open(fq1)
        for r1, r2 in zip(reads, reads):
            yield (r1, r2)
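# A minimal usage sketch for read_interleaved_or_paired() above -- not part of
# the original source. 'reads_1.fq', 'reads_2.fq', and 'interleaved.fq' are
# placeholder file names.
def count_pairs(fq1, fq2=None):
    n_pairs = 0
    for r1, r2 in read_interleaved_or_paired(fq1, fq2):
        n_pairs += 1
    return n_pairs

# e.g. count_pairs('reads_1.fq', 'reads_2.fq') for split R1/R2 files, or
# count_pairs('interleaved.fq') for a single interleaved file.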
def count_overlap(K, HT_SIZE, N_HT, filename, filename2, file_result, file_curve):
    if file_curve != 'N':
        count = 0
        for n, record in enumerate(screed.open(filename2)):
            count = count + 1
        max_count = count / 100
        file3 = open(file_curve, 'w')

    # consume first dataset
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for i in range(0, seq_len + 1 - K):
            kmer = sequence[i:i + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'

    # Bloom-filter false positive rate: (1 - e^(-n/m))^k, with k taken to be
    # the number of hash tables (N_HT); the original snippet used an
    # undefined name 'Z' as the exponent.
    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** N_HT
    printout1 = "%s:\n# of unique k-mers: %d\n# of occupied bins: %d\n" \
                "false positive rate: %f\n" % (filename, n_unique,
                                               ht.n_occupied(), fpr)

    # consume second dataset
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    seq_count = 0
    for n, record in enumerate(screed.open(filename2)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for i in range(0, seq_len + 1 - K):
            kmer = sequence[i:i + K]
            if not ht2.get(kmer):
                n_unique += 1
                if ht.get(kmer):
                    n_overlap += 1
            ht2.count(kmer)
        if file_curve != 'N':
            seq_count = seq_count + 1
            if seq_count == max_count:
                # n_occu = ht2.n_occupied
                string = str(n_unique) + ' ' + str(n_overlap) + '\n'
                file3 = open(file_curve, 'a')
                file3.write(string)
                file3.close()
                seq_count = 0
    print filename2, 'has been consumed.'

    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** N_HT
    printout2 = "%s:\n# of unique k-mers: %d\n# of occupied bins: %d\n" \
                "false positive rate: %f\n===============\n" \
                "# of overlap unique k-mers: %d\n" % (filename2, n_unique,
                                                      ht2.n_occupied(), fpr,
                                                      n_overlap)

    file_result_object = open(file_result, 'w')
    file_result_object.write(printout1)
    file_result_object.write(printout2)
def count_median(K,HT_SIZE,N_HT,filename,fileout): count = 0 for n, record in enumerate(screed.open(filename)): count = count+1 max_count = count/20 print max_count ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) ht.set_use_bigcount(True) # seq_array = [] seq_count = 0 median_array = [6,7,8,9,10,11,12] med={} for median in median_array: med[median] = 0 #print med count = 0 for n, record in enumerate(screed.open(filename)): sequence = record['sequence'] ht.consume(sequence) # seq_array.append(sequence) seq_count = seq_count + 1 if seq_count == max_count: count = count+1 number_of_sequence_consumed = max_count*count counted_sequence = 0 #print number_of_sequence_consumed for n2,record2 in enumerate(screed.open(filename)): counted_sequence = counted_sequence+1 sequence2 = record2['sequence'] #print sequence2 #for seq in seq_array: a, b, c = ht.get_median_count(sequence2) #print a,b,c for median in median_array: if a == median: #print "hit!" med[a] = med[a]+1 if counted_sequence == number_of_sequence_consumed: break #print med fileout_obj = open(fileout,'a') print_line = str(number_of_sequence_consumed) for median in median_array: print_line = print_line+ '\t'+str(med[median])+'\t' print_line = print_line+'\n' fileout_obj.write(print_line) fileout_obj.close() seq_count = 0 med={} for median in median_array: med[median] = 0
def main(): parser = argparse.ArgumentParser( description='Produce interleaved files from R1/R2 paired files') parser.add_argument('infiles', nargs='+') parser.add_argument('-o', '--output', dest='output', type=argparse.FileType('w'), default=sys.stdout) args = parser.parse_args() s1_file = args.infiles[0] if len(args.infiles) == 2: s2_file = args.infiles[1] else: s2_file = s1_file.replace('_R1_', '_R2_') print >>sys.stderr, "given only one file;" " guessing that R2 file is %s" % s2_file fail = False if not os.path.exists(s1_file): print >>sys.stderr, "Error! R1 file %s does not exist" % s1_file fail = True if not os.path.exists(s2_file): print >>sys.stderr, "Error! R2 file %s does not exist" % s2_file fail = True if fail: sys.exit(-1) print >>sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file) n = 0 for r1, r2 in itertools.izip(screed.open(s1_file), screed.open(s2_file)): if n % 100000 == 0: print >>sys.stderr, '...', n, 'pairs' n += 1 name1 = r1.name if not name1.endswith('/1'): name1 += '/1' name2 = r2.name if not name2.endswith('/2'): name2 += '/2' assert name1[:-2] == name2[:- 2], "This doesn't look like paired data!" " %s %s" % (name1, name2) r1.name = name1 r2.name = name2 args.output.write(output_pair(r1, r2)) print >>sys.stderr, 'final: interleaved %d pairs' % n
def test_sample_reads_randomly_S(): infile = utils.get_temp_filename('test.fq') in_dir = os.path.dirname(infile) shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile) script = scriptpath('sample-reads-randomly.py') # fix random number seed for reproducibility args = ['-N', '10', '-R', '1', '-S', '3'] badargs = list(args) badargs.extend(['-o', 'test', 'test.fq', 'test.fq']) (status, out, err) = runscript(script, badargs, in_dir, fail_ok=True) assert status == -1, (status, out, err) args.append('test.fq') runscript(script, args, in_dir) outfile = infile + '.subset.0' assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile)]) print seqs assert seqs == set(['895:1:1:1298:13380', '895:1:1:1347:3237', '895:1:1:1295:6189', '895:1:1:1342:11001', '895:1:1:1252:19493', '895:1:1:1318:10532', '895:1:1:1314:10430', '895:1:1:1347:8723', '895:1:1:1381:4958', '895:1:1:1338:6614']) outfile = infile + '.subset.1' assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile)]) print seqs assert seqs == set(['895:1:1:1384:20217', '895:1:1:1347:3237', '895:1:1:1348:18672', '895:1:1:1290:11501', '895:1:1:1386:7536', '895:1:1:1373:13994', '895:1:1:1355:13535', '895:1:1:1303:6251', '895:1:1:1381:4958', '895:1:1:1338:6614']) outfile = infile + '.subset.2' assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile)]) print seqs assert seqs == set(['895:1:1:1326:7273', '895:1:1:1384:20217', '895:1:1:1347:3237', '895:1:1:1353:6642', '895:1:1:1340:19387', '895:1:1:1252:19493', '895:1:1:1381:7062', '895:1:1:1383:3089', '895:1:1:1342:20695', '895:1:1:1303:6251'])
def main(contig1, contig2):
    ht = count(iterseq(screed.open(contig1)),
               iterseq(screed.open(contig2)))
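# `iterseq` and `count` are not shown in this snippet. A minimal sketch of
# what `iterseq` might look like, assuming it simply yields the sequence
# string of each screed record:
def iterseq(records):
    for record in records:
        yield record.sequence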
def read_partition_file(filename):
    for record_index, record in enumerate(
            screed.open(filename, parse_description=False)):
        _, partition_id = record.name.rsplit('\t', 1)
        yield record_index, record, int(partition_id)
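# A small usage sketch for read_partition_file() above -- not part of the
# original source. 'annotated.fa.part' is a placeholder for a partition-
# annotated file whose record names end in '\t<partition_id>'.
def partition_sizes(filename):
    sizes = {}
    for _, _, partition_id in read_partition_file(filename):
        sizes[partition_id] = sizes.get(partition_id, 0) + 1
    return sizes

# e.g. partition_sizes('annotated.fa.part')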
def watch(args): "Build a signature from raw FASTA/FASTQ coming in on stdin, search." parser = SourmashArgumentParser() parser.add_argument('sbt_name', help='name of SBT to search') parser.add_argument('inp_file', nargs='?', default='/dev/stdin') parser.add_argument('-q', '--quiet', action='store_true', help='suppress non-error output') parser.add_argument('-o', '--output', type=argparse.FileType('wt'), help='save signature generated from data here') parser.add_argument('--threshold', default=0.05, type=float, help='minimum threshold for matches (default=0.05)') parser.add_argument( '--input-is-protein', action='store_true', help='Consume protein sequences - no translation needed') sourmash_args.add_construct_moltype_args(parser) parser.add_argument( '-n', '--num-hashes', type=int, default=DEFAULT_N, help='number of hashes to use in each sketch (default: %(default)i)') parser.add_argument('--name', type=str, default='stdin', help='name to use for generated signature') sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K) args = parser.parse_args(args) set_quiet(args.quiet) if args.input_is_protein and args.dna: notify('WARNING: input is protein, turning off nucleotide hashing.') args.dna = False args.protein = True if args.dna and args.protein: notify('ERROR: cannot use "watch" with both nucleotide and protein.') if args.dna: moltype = 'DNA' is_protein = False dayhoff = False elif args.protein: moltype = 'protein' is_protein = True dayhoff = False else: moltype = 'dayhoff' is_protein = True dayhoff = True tree = load_sbt_index(args.sbt_name) # check ksize from the SBT we are loading ksize = args.ksize if ksize is None: leaf = next(iter(tree.leaves())) tree_mh = leaf.data.minhash ksize = tree_mh.ksize E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff) streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) notify('Computing signature for k={}, {} from stdin', ksize, moltype) def do_search(): search_fn = SearchMinHashesFindBest().search results = [] for leaf in tree.find(search_fn, streamsig, args.threshold): results.append((streamsig.similarity(leaf.data), leaf.data)) return results notify('reading sequences from stdin') screed_iter = screed.open(args.inp_file) watermark = WATERMARK_SIZE # iterate over input records n = 0 for n, record in enumerate(screed_iter): # at each watermark, print status & check cardinality if n >= watermark: notify('\r... read {} sequences', n, end='') watermark += WATERMARK_SIZE if do_search(): break if args.input_is_protein: E.add_protein(record.sequence) else: E.add_sequence(record.sequence, False) results = do_search() if not results: notify('... read {} sequences, no matches found.', n) else: results.sort(key=lambda x: -x[0]) # take best similarity, found_sig = results[0] print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity) if args.output: notify('saving signature to {}', args.output.name) sig.save_signatures([streamsig], args.output)
import sys
import screed

with open('../../../bagel-main/roster.csv') as roster:
    haves = [line.split(',')[1] for line in roster.readlines()]
# print haves

print 'mutant_label,oligo_label,sequence,scale,purification'
for line in sys.stdin:
    if line.strip() not in haves:
        for record in screed.open('../../oligos/{}.fasta'.format(
                line.strip())):
            print '{0},{0},{1},25nm,standard'.format(line.strip(),
                                                     record.sequence)
def main(): parser = argparse.ArgumentParser(description='XXX') env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT) env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE) parser.add_argument('--ksize', '-k', type=int, dest='ksize', default=env_ksize, help='k-mer size to use') parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=env_n_hashes, help='number of hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=env_hashsize, help='lower bound on hashsize to use') parser.add_argument("--trusted-cov", dest="trusted_cov", type=int, default=DEFAULT_CUTOFF) parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0) parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base cutoff on median k-mer abundance of this', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--tempdir', '-T', type=str, dest='tempdir', default='./') parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes NORMALIZE_LIMIT = args.normalize_to print('making hashtable') ht = khmer.CountingHash(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; use -T to change location' % tempdir) ### save_pass2 = 0 n_aligned = 0 n_corrected = 0 total_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) corrfilename = os.path.basename(filename) + '.corr' pass2list.append((filename, pass2filename, corrfilename)) pass2fp = open(pass2filename, 'w') corrfp = open(corrfilename, 'w') for n, read in enumerate(screed.open(filename)): total_reads += 1 if n % 10000 == 0: print('...', n, filename, n_aligned, n_corrected, save_pass2, \ total_reads) seq = read.sequence.replace('N', 'A') # build the alignment... score, graph_alignment, read_alignment, truncated = \ aligner.align(read.sequence) # next, decide whether or to keep it. output_corrected = False if not truncated: n_aligned += 1 # build a better sequence -- this is the corrected one. if True: graph_seq = graph_alignment.replace("-", "") else: graph_seq = "" for i in range(len(graph_alignment)): if graph_alignment[i] == "-": graph_seq += read_alignment[i] else: graph_seq += graph_alignment[i] corrected = graph_seq if graph_seq != read.sequence: n_corrected += 1 # get the minimum count for this new sequence mincount = ht.get_min_count(graph_seq) if mincount < args.normalize_to: output_corrected = True # has this portion of the graph saturated? if not, # consume & save => pass2. if output_corrected: corrfp.write(output_single(read, corrected)) else: # uncorrected... ht.consume(read.sequence) pass2fp.write(output_single(read, read.sequence)) save_pass2 += 1 pass2fp.close() corrfp.close() print('%s: kept aside %d of %d from first pass, in %s' % \ (filename, save_pass2, n, filename)) print('aligned %d of %d reads so far' % (n_aligned, total_reads)) print('changed %d of %d reads so far' % (n_corrected, total_reads)) for orig_filename, pass2filename, corrfilename in pass2list: print('second pass: looking at sequences kept aside in %s' % \ pass2filename) for n, read in enumerate(screed.open(pass2filename)): if n % 10000 == 0: print('... 
x 2', n, pass2filename, n_aligned, n_corrected, \ total_reads) corrfp = open(corrfilename, 'a') # build the alignment... score, graph_alignment, read_alignment, truncated = \ aligner.align(read.sequence) if truncated: # no good alignment; output original corrected = read.sequence else: n_aligned += 1 # build a better sequence -- this is the corrected one. if True: graph_seq = graph_alignment.replace("-", "") else: graph_seq = "" for i in range(len(graph_alignment)): if graph_alignment[i] == "-": graph_seq += read_alignment[i] else: graph_seq += graph_alignment[i] corrected = graph_seq if corrected != read.sequence: n_corrected += 1 corrfp.write(output_single(read, corrected)) print('removing %s' % pass2filename) os.unlink(pass2filename) print('removing temp directory & contents (%s)' % tempdir) shutil.rmtree(tempdir) print('Aligned %d of %d total' % (n_aligned, total_reads)) print('Changed %d of %d total' % (n_corrected, total_reads))
def main(): if len(sys.argv) != 4: mes = '*** python {} size <check-kmer-distance-py-output> <contigs.fa>' print(mes.format(os.path.basename(sys.argv[0])), file=sys.stderr) sys.exit(1) size = int(sys.argv[1]) infile = sys.argv[2] contigf = sys.argv[3] d = parse_kmer_distance(infile) print(('#contig_name\tcontig_len\tf_start\tr_start\tf_seq\tf_tm\tf_gc\t' 'r_seq\tr_tm\tr_gc\tta\tamp_size')) pair_pass = 0 for rec in screed.open(contigf): name = rec.name if not name in d: continue seq = rec.sequence for f_p, r_p in d[name]: assert len(seq) > r_p, '*** seq length < forward primer position' f = seq[f_p:(f_p + size)] r = RC(seq[r_p:(r_p + size)]) # primer3 functions only accept byte-strings f = f.encode('utf-8') #f = bytes(f, 'utf-8') r = r.encode('utf-8') #r = bytes(r, 'utf-8') if has_ambiguous(f) or has_ambiguous(r): continue # check tm f_tm = primer3.calcTm(f) if f_tm < TM_LOWER or f_tm > TM_UPPER: continue r_tm = primer3.calcTm(r) if r_tm < TM_LOWER or r_tm > TM_UPPER: continue if abs(f_tm - r_tm) > TM_DIFF_MAX: continue # check gc f_gc = check_GC(f) if f_gc < GC_LOWER or f_gc > GC_UPPER: continue r_gc = check_GC(r) if r_gc < GC_LOWER or r_gc > GC_UPPER: continue amp = seq[f_p:(r_p + size)].encode('utf-8') amp_tm = primer3.calcTm(amp) ta = 0.3 * min(f_tm, r_tm) + 0.7 * amp_tm - 14.9 # premierbiosoft #ta = 0.3*min(f_tm,r_tm) + 0.7*amp_tm - 25 # IDT recommendation ### thermodynamics check ### skipping here as loose filter # check hairpin and homodimer if SS: f_hp = primer3.calcHairpin(f) f_ho = primer3.calcHomodimer(f) if f_hp.dg < HP_DG_LIMIT or f_hp.dg > 0: continue if f_hp.tm > ta: continue if f_ho.dg < DI_DG_LIMIT or f_ho.dg > 0: #print('+++++>', f_ho.dg) continue r_hp = primer3.calcHairpin(r) r_ho = primer3.calcHomodimer(r) if r_hp.dg < HP_DG_LIMIT or r_ho.dg > 0: continue if r_hp.tm > ta: continue if r_ho.dg < DI_DG_LIMIT or r_ho.dg > 0: #print('=====>', r_ho.dg) continue # check heterodimer hetero = primer3.calcHeterodimer(f, r) if hetero.dg < DI_DG_LIMIT: continue pair_pass += 1 # forward, f_tm, f_gc, reverse, r_tm, r_gc, ta, amp_size mes = ('{}\t{}\t{}\t{}\t{}\t{:.1f}\t{:.2f}\t{}\t{:.1f}\t' '{:.2f}\t{}\t{}') print( mes.format(name, len(seq), f_p, r_p, f, f_tm, f_gc, r, r_tm, r_gc, ta, len(amp))) print('*** Pairs passed: {}'.format(pair_pass), file=sys.stderr)
def main(): p = argparse.ArgumentParser() p.add_argument('genome_files', nargs='+') p.add_argument('-o', '--output-csv', required=True) p.add_argument('-d', '--output-directory', required=True) args = p.parse_args() output_fp = open(args.output_csv, 'wt') w = csv.DictWriter(output_fp, fieldnames=['ident', 'display_name', 'genome_filename']) w.writeheader() try: os.mkdir(args.output_directory) print(f"Created genome directory '{args.output_directory}'") except FileExistsError: print(f"Genome directory '{args.output_directory}' already exists.") print(f"Copying genomes into '{args.output_directory}'") n = 0 for filename in args.genome_files: print(f"---") print(f"processing genome '{filename}'") for record in screed.open(filename): record_name = record.name break ident, *remainder = record_name.split(' ', 1) if remainder: # is list, needs to be string remainder = remainder[0] else: remainder = ident print(f"read identifer '{ident}' and name '{remainder}'") destfile = os.path.join(args.output_directory, f"{ident}_genomic.fna.gz") is_gzipped = False with contextlib.suppress(OSError): with gzip.open(filename) as fp: fp.read(1) is_gzipped = True if is_gzipped: print(f"copying '{filename}' to '{destfile}'") shutil.copyfile(filename, destfile) else: print(f"compressing '{filename}' into '{destfile}'") with open(filename, 'rb') as fp: with gzip.open(destfile, 'w') as outfp: outfp.write(fp.read()) w.writerow( dict(ident=ident, display_name=remainder, genome_filename=destfile)) n += 1 output_fp.close() print('---') print(f"wrote {n} genome entries to '{args.output_csv}'") return 0
import screed, sys

'''
pip install screed
'''

fp1 = open(sys.argv[2] + '.pe1.fq', 'w')
fp2 = open(sys.argv[2] + '.pe2.fq', 'w')

n = 0
for record in screed.open(sys.argv[1]):
    if n % 2 == 0:
        fp1.write('%s\n' % ('@' + record.name))
        fp1.write('%s\n' % record.sequence)
        fp1.write('%s\n' % '+')
        fp1.write('%s\n' % record.quality)
    else:
        fp2.write('%s\n' % ('@' + record.name))
        fp2.write('%s\n' % record.sequence)
        fp2.write('%s\n' % '+')
        fp2.write('%s\n' % record.quality)
    n = n + 1
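# Usage sketch (assumed invocation, not from the original source): the script
# above de-interleaves sys.argv[1] into <prefix>.pe1.fq / <prefix>.pe2.fq, e.g.
#
#     python deinterleave.py interleaved.fq out_prefix
#
# where 'deinterleave.py' and 'out_prefix' are placeholder names.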
print 'loading ht' ht = khmer.load_counting_hash(hashfile) print '...done!' K = ht.ksize() print 'loaded ht; K is %d, n_ht is %d, size ~ %g' % (K, len(ht.hashsizes()), ht.hashsizes()[0]) outfp = gzip.open(output, 'w') total = 0 total_masked = 0 for n, record in enumerate(screed.open(filename)): if n % 1000 == 0: print '...', n x = [] seq = record.sequence total += len(seq) - K + 1 pos = 0 while pos < len(seq) - K + 1: kmer = seq[pos:pos + K] if 'N' in kmer.upper(): x.extend(kmer) pos += K continue
def main(): parser = build_counting_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-p', '--paired', action='store_true') parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('-R', '--report-to-file', dest='report_file', type=argparse.FileType('w')) parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MAX_HASHSIZE and not args.loadhash: print( "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!", file=sys.stderr) print('\nPARAMETERS:', file=sys.stderr) print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr) print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize, file=sys.stderr) print(' - paired = %s \t\t(-p)' % args.paired, file=sys.stderr) print('', file=sys.stderr) print( 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize), file=sys.stderr) print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff report_fp = args.report_file filenames = args.input_filenames # In paired mode we read two records at a time batch_size = 1 if args.paired: batch_size = 2 if args.loadhash: print('loading hashtable from', args.loadhash) ht = khmer.load_counting_hash(args.loadhash) else: print('making hashtable') ht = khmer.CountingHash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepmedpct' outfp = open(output_name, 'w') n = -1 for n, batch in enumerate( batchwise(screed.open(input_filename), batch_size)): if n > 0 and n % 100000 == 0: print('... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%') print('... in file', input_filename) if report_fp: print(total, total - discarded, \ 1. - (discarded / float(total)), file=report_fp) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if args.paired: if not validpair(batch[0], batch[1]): print('Error: Improperly interleaved pairs %s %s' % (batch[0].name, batch[1].name), file=sys.stderr) sys.exit(-1) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < K: passed_length = False continue seq = record.sequence.replace('N', 'A') med, avg, dev = ht.get_median_count(seq) pct = 0. if avg: pct = dev / avg * 100 if med < DESIRED_COVERAGE and pct < 100: ht.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: if hasattr(record, 'quality'): outfp.write( '@%s\n%s\n+\n%s\n' % (record.name, record.sequence, record.quality)) else: outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += batch_size if -1 < n: print('DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%') print('output in', output_name) else: print('SKIPPED empty file', input_filename) if args.savehash: print('Saving hashfile through', input_filename) print('...saving to', args.savehash) ht.save(args.savehash) # Change 0.2 only if you really grok it. 
HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)
    if fp_rate > 0.20:
        print("**", file=sys.stderr)
        print("** ERROR: the counting hash is too small for", file=sys.stderr)
        print("** this data set. Increase hashsize/num ht.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(-1)
#! /usr/bin/env python
import screed, sys

fp = open('test-contigs.fa', 'wt')
for n, record in enumerate(screed.open('63.fa')):
    for i in range(0, len(record.sequence), 100000):
        fragment = record.sequence[i:i + 100000]
        fp.write(f'>seq{n}.{i}\n{fragment}\n')
fp.close()
#!/usr/bin/env python
from khmer.utils import write_record
import screed
import sys

mutations = {
    0: (29, 19, 'T', 'A'),
    1: (19, 41, 'G', 'A'),
    2: (67, 5, 'G', 'T'),
    3: (63, 20, 'T', 'C'),
}

readnum = 0
for n, record in enumerate(screed.open(sys.argv[1])):
    if n in mutations:
        refrstart, mismatchpos, origbase, newbase = mutations[n]
        readseq = record.sequence[refrstart:refrstart + 50]
        assert readseq[mismatchpos] == origbase
        mutseq = readseq[:mismatchpos] + newbase + readseq[mismatchpos + 1:]
        oldseqname = record.name.split('-')[-1]

        readnum += 1
        record.name = 'read{}_{}_exact'.format(readnum, oldseqname)
        record.sequence = readseq
        record.quality = '3' * 50
        write_record(record, sys.stdout)

        readnum += 1
        record.name = 'read{}_{}_mismatch'.format(readnum, oldseqname)
        record.sequence = mutseq
        # the snippet ends here in the source; presumably the mismatch read
        # is written out the same way as the exact read:
        record.quality = '3' * 50
        write_record(record, sys.stdout)
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name is not '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): if len(sys.argv) != 3: mes = '*** Usage: python {} params.config file.uniq2ref.primer' print( mes.format(os.path.basename(sys.argv[0])), file=sys.stderr, ) sys.exit(1) configf = sys.argv[1] primerfile = sys.argv[2] d = yaml.load(open(configf)) pass_cnt = 0 total_cnt = 0 for rec in screed.open(primerfile): total_cnt += 1 _name = rec.name name, _contig = _name.split(None, 1) contig_len = _contig.split('__', 1)[1] seq = rec.sequence seq_rc = RC(seq) # primer3 functions only accept byte-strings seq = seq.encode('utf-8') seq_rc = seq_rc.encode('utf-8') #seq = bytes(seq, 'utf-8') trig = False for di, seq in zip(('f', 'r'), (seq, seq_rc)): if has_ambiguous(seq): continue # check tm tm = primer3.calcTm(seq) if tm < d['TM_LOWER'] or tm > d['TM_UPPER']: continue # check gc gc = check_gc(seq) if gc < d['GC_LOWER'] or gc > d['GC_UPPER']: continue if d['GC_CLAMP']: cnt = end_gc_count(seq) if cnt > 3 or cnt < 1: continue if d['SS']: hp = primer3.calcHairpin(seq) ho = primer3.calcHomodimer(seq) if hp.dg < d['HP_DG_LIMIT'] or hp.dg > 0: continue if ho.dg < d['DI_DG_LIMIT'] or ho.dg > 0: continue trig = True mes = '>{}__{} contiglen__{};di__{};tm__{};gc__{}\n{}' print( mes.format(name, di, contig_len, di, tm, gc, seq), file=sys.stdout, ) if trig: pass_cnt += 1 mes = '*** # of primers (at least one direction) passed filter: {}' print(mes.format(pass_cnt), file=sys.stderr) if total_cnt == 0: mes = ('*** Empty file detected: {} (file.uniq2ref.primer), ' 'skipping..') print( mes.format(os.path.basename(primerfile)), file=sys.stderr, ) sys.exit(0)
N_HT = 4 THRESHOLD = 0.9 filename1 = sys.argv[1] filename2 = sys.argv[2] uniq1 = open(os.path.basename(sys.argv[1]) + '.uniq', 'w') uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w') paths = sys.argv[3] kh1 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) kh1.consume_fasta(filename1) kh2 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) kh2.consume_fasta(filename2) for record in screed.open(paths): n = 0 n_present = 0 path = record.sequence n = len(path) - K + 1 for i in range(n): if kh1.get(path[i:i + K]): n_present += 1 if n_present / float(n) >= THRESHOLD: present1 = True else: present1 = False n = 0
def main(): parser = build_construct_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('-R', '--report-to-file', dest='report_file', type=argparse.FileType('w')) parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print >> sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" print >> sys.stderr, '\nPARAMETERS:' print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize print >> sys.stderr, '' print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % ( args.n_hashes * args.min_hashsize) print >> sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff report_fp = args.report_file filenames = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename if report_fp: print>>report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += 1 if len(record.sequence) < K: continue seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med < DESIRED_COVERAGE: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the counting hash is too small for" print >> sys.stderr, "** this data set. Increase hashsize/num ht." print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(-1)
def main(): parser = build_counting_args() parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3) parser.add_argument( "--bits-theta", help= "Tuning parameter controlling trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta") parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_MINIMUM_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('--details-out', dest="details_out") parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: print >> sys.stderr, '\nPARAMETERS:' print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_tables print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ args.min_tablesize print >> sys.stderr, '' print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize)' % ( args.n_tables * args.min_tablesize) print >> sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables DESIRED_COVERAGE = args.cutoff filenames = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta) if args.details_out != None: details_out = open(args.details_out, "w") else: details_out = None total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepalign' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename total += 1 if len(record.sequence) < K: continue seq = record.sequence.upper().replace('N', 'A') ## score, graph_alignment, read_alignment, truncated = aligner.align( record.sequence) keep = False if truncated: keep = True else: if False: graph_seq = graph_alignment.replace("-", "") else: graph_seq = "" for i in range(len(graph_alignment)): if graph_alignment[i] == "-": graph_seq += read_alignment[i] else: graph_seq += graph_alignment[i] mincount = ht.get_min_count(graph_seq) keep = True seq = graph_seq #if mincount < DESIRED_COVERAGE: # keep = True # seq = graph_seq #else: # assert not keep if details_out != None: details_out.write( "+{7}\t{0:0.2f}\t{3}\t{4}\nread: {6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n" .format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name)) if keep: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, seq)) else: discarded += 1 if total: print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the counting hash is too small for" print >> sys.stderr, "** this data set. Increase hashsize/num ht." 
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
def main(): parser = build_counting_args() parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3) parser.add_argument("--bits-theta", help="Tuning parameter controlling" "trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta") parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base cutoff on abundance', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('--details-out', dest="details_out") parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: print('\nPARAMETERS:', file=sys.stderr) print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr) print(' - min hashsize = %-5.2g \t(-x)' % \ args.max_tablesize, file=sys.stderr) print('', file=sys.stderr) print('Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize)' % \ (args.n_tables * args.max_tablesize), file=sys.stderr) print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.max_tablesize N_HT = args.n_tables filenames = args.input_filenames if args.loadhash: print('loading hashtable from', args.loadhash) ht = khmer.load_counting_hash(args.loadhash) else: print('making hashtable') ht = khmer.CountingHash(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta) if args.details_out is not None: details_out = open(args.details_out, "w") else: details_out = None total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepvar' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print('... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%') print('... in file', input_filename) total += 1 if len(record.sequence) < K: continue seq = record.sequence.upper().replace('N', 'A') ## # build the alignment... score, graph_alignment, read_alignment, truncated = \ aligner.align(record.sequence) # next, decide whether or to keep it. keep = False if truncated: keep = True # keep all truncated alignments - why? else: # build a better sequence -- this is the corrected one. graph_seq = graph_alignment.replace("-", "") # OR? #graph_seq = "" #for i in range(len(graph_alignment)): # if graph_alignment[i] == "-": # graph_seq += read_alignment[i] # else: # graph_seq += graph_alignment[i] # get the minimum count for this new sequence mincount = ht.get_min_count(graph_seq) if mincount < args.normalize_to: keep = True if details_out is not None: details_out.write( "+{7}\t{0:0.2f}\t{3}\t{4}\nread: " "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n" "".format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name)) if keep: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 if total: print('DONE with', input_filename, \ '; kept', total - discarded, 'of', total, 'or', \ int(100. - discarded / float(total) * 100.), '%') print('output in', output_name) if args.savehash: print('Saving hashfile through', input_filename) print('...saving to', args.savehash) ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. 
fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2) print('fp rate estimated to be %1.3f' % fp_rate)
def main(argv): parser = argparse.ArgumentParser() parser.add_argument('bcalm_unitigs') parser.add_argument('gxt_out') parser.add_argument('contigs_out') parser.add_argument('-k', '--ksize', type=int, default=31) parser.add_argument('-d', '--debug', action='store_true') parser.add_argument('-P', '--pendants', action="store_true", help="don't remove low abundance pendants") args = parser.parse_args(argv) ksize = args.ksize trim = not args.pendants # track links between contig IDs link_d = collections.defaultdict(set) gxtfp = open(args.gxt_out, 'wt') contigsfp = bgzf.open(args.contigs_out, 'wb') info_filename = args.contigs_out + '.info.csv' info_fp = open(info_filename, 'wt') # track offsets, mean abunds, and # k-mers for each contig offsets = {} mean_abunds = {} sizes = {} sequences = {} # walk the input unitigs file, tracking links between contigs and # writing them to contigs_out. max_contig_id = 0 print('reading unitigs from {}'.format(args.bcalm_unitigs)) for n, record in enumerate(screed.open(args.bcalm_unitigs)): if n % 10000 == 0: print('...', n, file=sys.stderr, end='\r') name = record.name name_split = name.split() # note: contig_id may not be in order. contig_id = int(name_split[0]) # track the various links links = [x for x in name_split[1:] if x.startswith('L:')] link_ids = [x.split(':')[2] for x in links] link_ids = [int(x) for x in link_ids] if args.debug: print('link_ids for {} are {}'.format(contig_id, link_ids)) link_d[contig_id].update(link_ids) # get mean abund abund = [x for x in name_split[1:] if x.startswith('km:')] assert len(abund) == 1, abund abund = abund[0].split(':') assert len(abund) == 3 abund = float(abund[2]) mean_abunds[contig_id] = abund # where are we in the output file? assert contig_id not in offsets sequences[contig_id] = record.sequence sizes[contig_id] = len(record.sequence) - ksize + 1 # if we are removing pendants, we need to relabel the contigs so they are # consecutive integers starting from 0. If not, we create dummy data # structures to make the interface the same elsewhere in the data if trim: non_pendants = [x for x, N in link_d.items() if len(N) > 1 and \ mean_abunds[x] > TRIM_CUTOFF] else: non_pendants = list(link_d.keys()) aliases = {x: i for i, x in enumerate(non_pendants)} n = len(aliases) for x, i in aliases.items(): offsets[x] = contigsfp.tell() contigsfp.write('>{}\n{}\n'.format(i, sequences[x])) contigsfp.close() print('... done! {} unitigs'.format(n)) # start the gxt file by writing the number of nodes (unitigs)) gxtfp.write('{}\n'.format(n)) # write out all of the links, in 'from to' format. n_edges = 0 for node, edgelist in link_d.items(): if node not in aliases: continue for next_node in edgelist: if next_node not in aliases: continue gxtfp.write('{} {}\n'.format(aliases[node], aliases[next_node])) n_edges += 1 print('{} vertices, {} edges'.format(n, n_edges)) info_fp.write('contig_id,offset,mean_abund,n_kmers\n') for v, i in aliases.items(): info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v], mean_abunds[v], sizes[v]))
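# Small standalone sketch of the unitig header parsing done above. BCALM-style
# headers look roughly like '>3 LN:i:41 KC:i:120 km:f:4.5 L:+:7:- L:-:12:+';
# the example header below is made up, and the field layout follows the parsing
# in the script above (leading id, 'km:f:<mean abundance>', 'L:<sign>:<id>:<sign>').

def parse_unitig_header(name):
    fields = name.split()
    contig_id = int(fields[0])

    # neighbor unitig IDs from the L: link fields
    link_ids = [int(x.split(':')[2]) for x in fields[1:] if x.startswith('L:')]

    # mean k-mer abundance from the km: field
    km = [x for x in fields[1:] if x.startswith('km:')]
    assert len(km) == 1, km
    mean_abund = float(km[0].split(':')[2])

    return contig_id, link_ids, mean_abund


if __name__ == '__main__':
    print(parse_unitig_header('3 LN:i:41 KC:i:120 km:f:4.5 L:+:7:- L:-:12:+'))
    # -> (3, [7, 12], 4.5)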
def main(): info('sweep-reads-buffered.py', ['sweep']) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype='hashbits') K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_file_status(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = ix.next() del ix extension = 'fa' if hasattr(record, 'accuracy'): # fastq! extension = 'fq' output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >> sys.stderr, 'consuming input sequences...' if args.label_by_pid: print >> sys.stderr, '...labeling by partition id (pid)' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >> sys.stderr, '...labeling by sequence' for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, \ '...labeling to create groups of size {s}'.format( s=args.group_size) label = -1 g = 0 try: outfp = open( '{pref}_base_{g}.{ext}'.format(pref=output_pref, g=g, ext=extension), 'wb') for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open( '{pref}_base_{g}.{ext}'.format( pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels( record.sequence, label) if hasattr(record, 'accuracy'): outfp.write('@{name}\n{seq}+{accuracy}\n'.format( name=record.name, seq=record.sequence, accuracy=record.accuracy)) else: outfp.write('>{name}\n{seq}\n'.format( name=record.name, seq=record.sequence)) except IOError as e: print >> sys.stderr, '!! ERROR !!', e print >> sys.stderr, '...error splitting input. exiting...' except IOError as e: print >> sys.stderr, '!! ERROR: !!', e print >> sys.stderr, '...error consuming \ {i}. exiting...'.format(i=input_fastp) print >> sys.stderr, 'done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.n_tags(), l=ht.n_labels()) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >> sys.stderr, '** sweeping {read_file} for labels...'.format( read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >> sys.stderr, '!! 
ERROR: !!', error print >> sys.stderr, '*** Could not open {fn}, skipping...'.format( fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, 'accuracy'): seq_str = fmt_fastq(name, seq, record.accuracy, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, 'multi') n_mlabeled += 1 label_dict['multi'] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >> sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >> sys.stderr, '! WARNING: Sweep finished with errors !' print >> sys.stderr, '** {writee} reads not written'.format( writee=output_buffer.num_write_errors) print >> sys.stderr, '** {filee} errors opening files'.format( filee=output_buffer.num_file_errors) print >> sys.stderr, 'swept {n_reads} for labels...'.format( n_reads=n_labeled + n_orphaned) print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format( nc=n_labeled, no=n_orphaned) print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) print >> sys.stderr, '** outputting label number distribution...' fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) with open(fn, 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) print >> sys.stderr, '** outputting label read counts...' with open(fn, 'wb') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
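# Minimal sketch of the output-binning rule used by the sweep above: reads with
# no labels go to an 'orphaned' bin, reads with exactly one label go to that
# label's bin, and reads with several labels go to a shared 'multi' bin. The
# dict-of-lists buffer here is illustrative only (the script uses ReadBufferManager).

from collections import defaultdict

def bin_read(seq_str, labels, buffers):
    if not labels:
        buffers['orphaned'].append(seq_str)
    elif len(labels) == 1:
        buffers[labels[0]].append(seq_str)
    else:
        buffers['multi'].append(seq_str)


if __name__ == '__main__':
    buffers = defaultdict(list)
    bin_read('>r1\nACGT\n', [], buffers)
    bin_read('>r2\nACGT\n', [5], buffers)
    bin_read('>r3\nACGT\n', [5, 9], buffers)
    print({k: len(v) for k, v in buffers.items()})
    # -> {'orphaned': 1, 5: 1, 'multi': 1}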
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n, ) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
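# Sketch of the stopping rule used by collect-reads above: keep a running mean
# of the per-read median k-mer count and stop consuming once it passes the
# target coverage. The median counts below are made-up numbers standing in for
# htable.get_median_count() values.

def collect_until_coverage(median_counts, target_coverage):
    """Return how many reads are consumed before the running mean of the
    per-read median counts first exceeds target_coverage."""
    total = 0.0
    for n, med in enumerate(median_counts, start=1):
        total += med
        if total / n > target_coverage:
            return n
    return len(median_counts)


if __name__ == '__main__':
    meds = [1, 2, 5, 10, 20, 40, 60]   # hypothetical per-read median counts
    print(collect_until_coverage(meds, target_coverage=10))  # -> 6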
def main(): info('extract-paired-reads.py') args = get_parser().parse_args() check_file_status(args.infile) infiles = [args.infile] check_space(infiles) outfile = os.path.basename(args.infile) if len(sys.argv) > 2: outfile = sys.argv[2] single_fp = open(outfile + '.se', 'w') paired_fp = open(outfile + '.pe', 'w') print 'reading file "%s"' % args.infile print 'outputting interleaved pairs to "%s.pe"' % outfile print 'outputting orphans to "%s.se"' % outfile last_record = None last_name = None n_pe = 0 n_se = 0 record = None index = 0 for index, record in enumerate(screed.open(sys.argv[1])): if index % 100000 == 0 and index > 0: print '...', index name = record['name'].split()[0] if last_record: if is_pair(last_name, name): paired_fp.write(output_pair(last_record, record)) name, record = None, None n_pe += 1 else: single_fp.write(output_single(last_record)) n_se += 1 last_name = name last_record = record if last_record: if is_pair(last_name, name): paired_fp.write(output_pair(last_record, record)) name, record = None, None n_pe += 1 else: single_fp.write(output_single(last_record)) name, record = None, None n_se += 1 if record: single_fp.write(output_single(record)) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print 'DONE; read %d sequences, %d pairs and %d singletons' % \ (index + 1, n_pe, n_se) print >> sys.stderr, 'wrote to: ' + outfile \ + '.se' + ' and ' + outfile + '.pe'
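# Illustrative stand-in for the is_pair() check used above, covering only the
# old-style '/1' and '/2' read-name convention; khmer's real helper also
# understands other naming formats. The read names below are made up.

def is_pair_slash(name1, name2):
    """True if name1 and name2 look like the /1 and /2 of the same fragment."""
    if name1.endswith('/1') and name2.endswith('/2'):
        return name1[:-2] == name2[:-2]
    return False


if __name__ == '__main__':
    print(is_pair_slash('read17/1', 'read17/2'))  # -> True
    print(is_pair_slash('read17/1', 'read18/2'))  # -> False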
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_input_files(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >> sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >> sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >> sys.stderr, 'opening', filename, 'for reading' screed_iter = screed.open(filename, parse_description=False) for count, (_, ispair, rcrd1, rcrd2) in enumerate( broken_paired_reader(screed_iter, force_single=args.force_single)): if count % 10000 == 0: print >> sys.stderr, '...', count, 'reads scanned' if count >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if count < args.num_reads: for n in range(num_samples): reads[n].append((rcrd1, rcrd2)) else: assert len(reads[n]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
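# Self-contained sketch of the reservoir sampling applied per read pair above:
# keep the first N items, then for each later item replace a random slot with
# probability N/count. This is the textbook algorithm; the inputs below are
# just integers rather than read records.

import random

def reservoir_sample(items, num_keep, seed=None):
    rng = random.Random(seed)
    reservoir = []
    for count, item in enumerate(items, start=1):
        if count <= num_keep:
            reservoir.append(item)
        else:
            guess = rng.randint(1, count)
            if guess <= num_keep:
                reservoir[guess - 1] = item
    return reservoir


if __name__ == '__main__':
    print(reservoir_sample(range(1000), num_keep=5, seed=42))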
#! /usr/bin/env python
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected]
#
import sys
import screed
import khmer

K = 32

infile = sys.argv[1]

ht = khmer.new_hashbits(K, 1, 1)
ht.consume_partitioned_fasta(infile)

for n, record in enumerate(screed.open(infile)):
    if n % 10000 == 0:
        print '... checking', n
    assert ht.is_single_partition(record.sequence)
def normalize_by_median(input_filename, outfp, htable, args, report_fp=None): desired_coverage = args.cutoff ksize = htable.ksize() # In paired mode we read two records at a time batch_size = 1 if args.paired: batch_size = 2 index = -1 total = 0 discarded = 0 for index, batch in enumerate(batchwise(screed.open( input_filename), batch_size)): if index > 0 and index % 100000 == 0: print '... kept {kept} of {total} or {perc:2}%'.format( kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) print '... in file', input_filename if report_fp: print >> report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if args.paired: if not validpair(batch[0], batch[1]): raise IOError('Error: Improperly interleaved pairs \ {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name)) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < ksize: passed_length = False continue seq = record.sequence.replace('N', 'A') med, _, _ = htable.get_median_count(seq) if med < desired_coverage: htable.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: if hasattr(record, 'accuracy'): outfp.write( '@{name}\n{seq}\n' '+\n{acc}\n'.format(name=record.name, seq=record.sequence, acc=record.accuracy)) else: outfp.write( '>{name}\n{seq}\n'.format(name=record.name, seq=record.sequence)) else: discarded += batch_size return total, discarded
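# Pure-Python sketch of the digital-normalization decision in
# normalize_by_median() above: estimate a read's coverage as the median count
# of its k-mers and keep (and count) the read only while that median is below
# the desired coverage. A dict stands in for the khmer counting table.

from collections import defaultdict
from statistics import median

def keep_read(seq, counts, ksize, desired_coverage):
    kmers = [seq[i:i + ksize] for i in range(len(seq) - ksize + 1)]
    if median(counts[k] for k in kmers) < desired_coverage:
        for k in kmers:
            counts[k] += 1
        return True
    return False


if __name__ == '__main__':
    counts = defaultdict(int)
    read = 'ATGGCATTAGCGT'
    print(keep_read(read, counts, ksize=5, desired_coverage=3))  # True: novel read
    for _ in range(5):
        keep_read(read, counts, ksize=5, desired_coverage=3)
    print(keep_read(read, counts, ksize=5, desired_coverage=3))  # False: saturated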
#! /usr/bin/env python
import screed
import sys
import random

random.seed(1)  # make this reproducible, please.

COVERAGE = 200
READLEN = 100
ERROR_RATE = 100

record = iter(screed.open(sys.argv[1])).next()
genome = record.sequence
len_genome = len(genome)

n_reads = int(len_genome * COVERAGE / float(READLEN))
reads_mut = 0
total_mut = 0

for i in range(n_reads):
    start = random.randint(0, len_genome - READLEN)
    read = genome[start:start + READLEN].upper()

    # reverse complement?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # error?
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
def main(): p = argparse.ArgumentParser() p.add_argument("sample_id") p.add_argument("gather_csv") p.add_argument("--outdir", default="outputs") args = p.parse_args() sample_id = args.sample_id outdir = args.outdir.rstrip("/") print(f"reading gather results from {args.gather_csv}") rows = [] with open(args.gather_csv, "rt") as fp: r = csv.DictReader(fp) for row in r: rows.append(row) print(f"...loaded {len(rows)} results total.") print("checking input/output pairs:") pairs = [] fail = False for row in rows: acc = row["name"].split()[0] filename = f"{outdir}/mapping/{sample_id}.x.{acc}.mapped.fq.gz" overlapping = f"{outdir}/mapping/{sample_id}.x.{acc}.overlap.fq.gz" leftover = f"{outdir}/mapping/{sample_id}.x.{acc}.leftover.fq.gz" if not os.path.exists(filename): print( f"ERROR: input filename {filename} does not exist. Will exit.") fail = True pairs.append((acc, filename, overlapping, leftover)) if fail: print("Some required input files not found - exiting.") sys.exit(-1) ignore_reads = set() for n, (acc, filename, overlapping, leftover) in enumerate(pairs): overlap_fp = gzip.open(overlapping, "wt") leftover_fp = gzip.open(leftover, "wt") print('-' * 30) print(f"reading sequences from {filename};") print(f"writing overlapping to {overlapping}") print(f"writing remaining to {leftover}") n_wrote = 0 screed_fp = screed.open(filename) for record in screed_fp: fq = f"@{record.name}\n{record.sequence}\n+\n{record.quality}\n" if record.name in ignore_reads: overlap_fp.write(fq) else: ignore_reads.add(record.name) leftover_fp.write(fq) n_wrote += 1 screed_fp.close() print(f"wrote {n_wrote} leftover records for {sample_id}.x.{acc};") print(f"{len(ignore_reads)} total reads to ignore moving forward.") print(f"file {n+1} of {len(pairs)} total") overlap_fp.close() leftover_fp.close() # <-- here is where we can go through the input reads and output unmapped. # (OR, save 'ignore_reads' and let another script handle it.) return 0
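# Minimal sketch of the 'first file wins' split used above: a read name already
# claimed by an earlier file goes to that file's overlap output; otherwise it is
# written as leftover and remembered. The file contents below are hypothetical.

def split_overlapping(files_of_names):
    """files_of_names: list of lists of read names, in priority order.
    Returns (leftover_per_file, overlap_per_file)."""
    seen = set()
    leftover, overlap = [], []
    for names in files_of_names:
        keep, dup = [], []
        for name in names:
            if name in seen:
                dup.append(name)
            else:
                seen.add(name)
                keep.append(name)
        leftover.append(keep)
        overlap.append(dup)
    return leftover, overlap


if __name__ == '__main__':
    print(split_overlapping([['r1', 'r2'], ['r2', 'r3'], ['r1', 'r3', 'r4']]))
    # -> ([['r1', 'r2'], ['r3'], ['r4']], [[], ['r2'], ['r1', 'r3']])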
def create_records_iter():
    print('reading cDBG nodes from {}'.format(contigs_filename))
    return screed.open(contigs_filename)
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info( 'created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info( "... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
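# Toy sketch of the two-pass layout used by trim-low-abund above: the first
# pass streams each input, handling reads it can already judge and writing the
# rest to a '.pass2' temp file; the second pass revisits only those set-aside
# reads once the counts are complete. 'can_judge' and 'handle' are hypothetical
# stand-ins for the countgraph coverage check and the trim/write step.

import os
import tempfile

def two_pass(named_read_iters, can_judge, handle):
    tempdir = tempfile.mkdtemp(prefix='twopass')
    pass2list = []
    for name, reads in named_read_iters:              # first pass
        pass2name = os.path.join(tempdir, os.path.basename(name) + '.pass2')
        with open(pass2name, 'w') as pass2fp:
            for read in reads:
                if can_judge(read):
                    handle(read)
                else:
                    pass2fp.write(read + '\n')
        pass2list.append(pass2name)
    for pass2name in pass2list:                       # second pass
        with open(pass2name) as pass2fp:
            for read in pass2fp:
                handle(read.rstrip('\n'))
        os.unlink(pass2name)
    os.rmdir(tempdir)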
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('catlas_prefix', help='input file')
    args = parser.parse_args()

    basename = os.path.basename(args.catlas_prefix)
    cdbg = os.path.join(args.catlas_prefix, 'cdbg.gxt')
    infp = open(cdbg, 'rt')

    outname = os.path.join(args.catlas_prefix, 'cdbg.gml')
    outfp = open(outname, 'wt')

    print('reading contig sizes')
    contigsfile = os.path.join(args.catlas_prefix, 'contigs.fa.gz')
    node_sizes = {}
    for n, record in enumerate(screed.open(contigsfile)):
        node_sizes[int(record.name)] = len(record.sequence)

    print('converting {} to {}...'.format(cdbg, outname))

    writer = GmlWriter(outfp)

    num_nodes = int(next(infp))
    for x in range(num_nodes):
        writer.add_vertex(x, node_sizes.get(x, 1))

    for line in infp:
        u, v = line.split()
        writer.add_edge(int(u), int(v))

    writer.done()
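# The GmlWriter used above is defined elsewhere in this codebase; a minimal
# illustrative equivalent, writing plain GML with a size attribute per node and
# undirected edges, might look like the class below (the attribute names are
# assumptions, not the project's exact output).

class MiniGmlWriter(object):
    def __init__(self, fp):
        self.fp = fp
        self.fp.write('graph [\n')

    def add_vertex(self, node_id, size):
        self.fp.write('  node [ id {} size {} ]\n'.format(node_id, size))

    def add_edge(self, u, v):
        self.fp.write('  edge [ source {} target {} ]\n'.format(u, v))

    def done(self):
        self.fp.write(']\n')


# usage sketch: write a two-node, one-edge graph to stdout
import sys
w = MiniGmlWriter(sys.stdout)
w.add_vertex(0, 41)
w.add_vertex(1, 77)
w.add_edge(0, 1)
w.done()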