Example #1
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data("paired-mixed.fa")

    ex_outfile1 = utils.get_test_data("paired-mixed.fa.pe")
    ex_outfile2 = utils.get_test_data("paired-mixed.fa.se")

    # actual output files...
    outfile1 = utils.get_temp_filename("paired-mixed.fa.pe")
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename("paired-mixed.fa.se", in_dir)

    script = scriptpath("extract-paired-reads.py")
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #2
def test_filter_abund_single_8_retain_Ns():
    # check that filter-abund-single retains
    # sequences with Ns, and treats them as As.

    infile = utils.get_temp_filename('test.fq')
    in_dir = os.path.dirname(infile)

    # copy test file over to test.fq & load into countgraph
    shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)

    script = 'filter-abund-single.py'
    args = ['-k', '17', '-x', '1e7', '-N', '2', '-C', '3', infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    # test for a sequence with an 'N' in it --
    names = set([r.name for r in screed.open(outfile)])
    assert '895:1:37:17593:9954 1::FOO_withN' in names, names

    # check that the 'N' was not rewritten to an 'A' in the output sequence
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAG' not in seqs, seqs

    # ...and that an 'N' remains in the output sequences
    found_N = False
    for s in seqs:
        if 'N' in s:
            found_N = True
    assert found_N, seqs
Example #3
def test_extract_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired-mixed.fq')

    ex_outfile1 = utils.get_test_data('paired-mixed.fq.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fq.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fq.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #4
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired.fa')

    ex_outfile1 = utils.get_test_data('paired.fa.1')
    ex_outfile2 = utils.get_test_data('paired.fa.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fa.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fa.2', in_dir)

    script = 'split-paired-reads.py'
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #5
def test_extract_partitions_fq():
    seqfile = utils.get_test_data('random-20-a.fq')
    graphbase = _make_graph(
        seqfile, do_partition=True, annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fq.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['extracted', partfile]

    runscript(script, args, in_dir)

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fq')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)

    dist = open(distfile).readline()
    assert dist.strip() == '99 1 1 99'

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)

    quals = set([r.accuracy for r in screed.open(partfile)])
    quals = list(quals)
    assert quals[0], quals
Example #6
def test_split_paired_reads_2_mixed_fq_orphans_to_file():
    # test input file
    infile = utils.copy_test_data('paired-mixed-2.fq')
    in_dir = os.path.dirname(infile)
    outfile = utils.get_temp_filename('out.fq')

    script = 'split-paired-reads.py'
    args = ['-0', outfile, infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0
    assert "split 6 sequences (3 left, 3 right, 5 orphans)" in err, err

    n_orphans = len([1 for record in screed.open(outfile)])
    assert n_orphans == 5
    n_left = len([1 for record in screed.open(infile + '.1')])
    assert n_left == 3
    n_right = len([1 for record in screed.open(infile + '.2')])
    assert n_right == 3
    for filename in [outfile, infile + '.1', infile + '.2']:
        fp = gzip.open(filename)
        try:
            fp.read()
        except IOError as e:
            assert "Not a gzipped file" in str(e), str(e)
        fp.close()
Example #7
def test_split_paired_reads_3_output_files_right():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    output_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('yyy', output_dir)

    script = 'split-paired-reads.py'
    args = ['-2', outfile2, '-d', output_dir, infile]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0
Example #8
def test_interleave_read_stdout():
    # create input files
    infile1 = utils.get_test_data('paired-slash1.fq.1')
    infile2 = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired-slash1.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2]

    (stats, out, err) = utils.runscript(script, args)

    with open(outfile, 'w') as ofile:
        ofile.write(out)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #9
def test_normalize_by_median_paired_fq():
    CUTOFF = '20'

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-paired.fq'), infile)

    script = 'normalize-by-median.py'
    args = ['-C', CUTOFF, '-p', '-k', '17', infile]
    _, out, err = utils.runscript(script, args, in_dir)
    print(out)
    print(err)

    outfile = infile + '.keep'
    assert os.path.exists(outfile), outfile

    seqs = [r.sequence for r in screed.open(outfile)]
    assert len(seqs) == 6, len(seqs)
    assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
    assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs

    names = [r.name for r in screed.open(outfile, parse_description=False)]
    assert len(names) == 6, names
    assert '895:1:37:17593:9954 1::FOO' in names, names
    assert '895:1:37:17593:9954 2::FOO' in names, names
Example #10
def main():
    dbfile = sys.argv[1]
    mapfile = sys.argv[2]

    lengths = {}
    for n, record in enumerate(screed.open(dbfile)):
        if n % 100000 == 0:
            print('...', n)
        lengths[record.name] = len(record.sequence)

    sums = {}
    for n, line in enumerate(open(mapfile)):
        if n % 100000 == 0:
            print('... 2x', n)
        x = line.split('\t')
        name = x[2]
        sums[name] = sums.get(name, 0) + 1

    mapped_reads = n + 1

    # RPKM: reads per kilobase of sequence per million mapped reads
    rpkms = {}
    for k in sums:
        rpkms[k] = sums[k] * (1000. / float(lengths[k])) / \
            (float(mapped_reads) / 1e6)

    outfp = open(dbfile + '.cov', 'w')
    for n, record in enumerate(screed.open(dbfile)):
        if n % 100000 == 0:
            print('...', n)

        print(">%s[cov=%d]\n%s" % (record.name,
                                   rpkms.get(record.name, 0),
                                   record.sequence),
              file=outfp)
Example #11
def test_extract_paired_reads_3_output_dir():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # output directory
    out_dir = utils.get_temp_filename('output')

    script = 'extract-paired-reads.py'
    args = [infile, '-d', out_dir]

    utils.runscript(script, args)

    outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe')
    outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se')
    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #12
def fixseqs(filein, fileout):

    fw = open(fileout, 'w')
    fi = open(filein, 'r')
    line1 = fi.readline()
    
    #Does line 1 correspond to FASTA?
    if line1[0] == '>':
        for n, record in enumerate(screed.open(filein)):
            name = record['name']
            sequence = record['sequence']
            fw.write('>%s\n%s\n' % (name, sequence))
            #print name, "\n", sequence
    #Does line 1 correspond to FASTQ?
    elif line1[0] == '@':
        for n, record in enumerate(screed.open(filein)):
            if 'N' in record['annotations']:
                name = record['name'] + ' ' + record['annotations']
                sequence = record['sequence']
                accuracy = record['accuracy']
                fw.write('@%s\n%s\n+\n%s\n' % (name, sequence, accuracy))
    #No FASTA or FASTQ file provided
    else:
        print('Neither fasta nor fastq input. Do your headers start with\n'
              '> (fasta) or @ (fastq)?')
    fi.close()
    fw.close()
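
A minimal call sketch for the helper above; the filenames are hypothetical, and the format branch is picked from the first character of the input file:

# Hypothetical filenames. Note the FASTQ branch above only keeps records
# whose annotation field contains an 'N', as coded.
fixseqs('reads.fq', 'reads.fixed.fq')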
Example #13
def test_normalize_by_median_paired_fq():
    CUTOFF = "20"

    infile = utils.get_temp_filename("test.fa")
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data("test-abund-read-paired.fq"), infile)

    script = "normalize-by-median.py"
    args = ["-C", CUTOFF, "-p", "-k", "17", infile]
    _, out, err = utils.runscript(script, args, in_dir)
    print(out)
    print(err)

    outfile = infile + ".keep"
    assert os.path.exists(outfile), outfile

    seqs = [r.sequence for r in screed.open(outfile)]
    assert len(seqs) == 6, len(seqs)
    assert seqs[0].startswith("GGTTGACGGGGCTCAGGGGG"), seqs
    assert seqs[1].startswith("GGTTGACGGGGCTCAGGG"), seqs

    names = [r.name for r in screed.open(outfile)]
    assert len(names) == 6, names
    assert "895:1:37:17593:9954 1::FOO" in names, names
    assert "895:1:37:17593:9954 2::FOO" in names, names
Example #14
def test_split_paired_reads_2_mixed_fq_gzfile():
    # test input file
    infile = utils.get_temp_filename('test.fq')
    shutil.copyfile(utils.get_test_data('paired-mixed-2.fq'), infile)
    in_dir = os.path.dirname(infile)
    outfile = utils.get_temp_filename('out.fq')

    script = 'split-paired-reads.py'
    args = ['-0', outfile, '--gzip', infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0
    assert "split 6 sequences (3 left, 3 right, 5 orphans)" in err, err

    n_orphans = len([1 for record in screed.open(outfile)])
    assert n_orphans == 5
    n_left = len([1 for record in screed.open(infile + '.1')])
    assert n_left == 3
    n_right = len([1 for record in screed.open(infile + '.2')])
    assert n_right == 3

    for filename in [outfile, infile + '.1', infile + '.2']:
        fp = gzip.open(filename)
        fp.read()                       # this will fail if not gzip file.
        fp.close()
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('genomes')
    parser.add_argument('reads')
    args = parser.parse_args()

    # build a counting label hash + readaligner.
    lh = khmer.CountingLabelHash(21, 1e7, 4)
    lh.consume_fasta_and_tag_with_labels(args.genomes)
    aligner = khmer.ReadAligner(lh.graph, 1, 1.0)

    names = []
    # (labels in 'lh' are in the order of the sequences in the file)
    for grec in screed.open(args.genomes):
        names.append(grec.name)

    print('loaded two references:', names)

    # run through all the reads, align, and use alignments to look up
    # the label.
    for record in screed.open(args.reads):
        # build alignments against cg
        _, ga, ra, truncated = aligner.align(record.sequence)

        if truncated:
            print('NO MATCHES', record.name)
        else:
            # now grab the associated labels
            labels = lh.sweep_label_neighborhood(ga)

            # print out the matches.
            matches = set([names[i] for i in labels])
            print(record.name, 'matches to', ", ".join(matches))
Example #16
def main():
    if len(sys.argv) < 2:
        sys.stderr.write('*** Usage: python {} <seqfile>\n'.format(
            os.path.basename(sys.argv[0])))

        sys.exit(1)

    seqfile = sys.argv[1]
    d = OrderedDict()
    for rec in screed.open(seqfile):
        name = rec.name.split(None, 1)[0]
        seq = rec.sequence
        if name.endswith('/1') or name.endswith('/2'):
            name2 = name[:-2]
            if name2 in d:
                if d[name2][0] > len(seq):
                    continue

            d[name2] = len(seq), name
        else:
            d[name] = len(seq), name

    st = set([d[name][-1] for name in d])
    for rec in screed.open(seqfile):
        name = rec.name.split(None, 1)[0]
        if name in st:
            sys.stdout.write('>{}\n{}\n'.format(name, rec.sequence))
Example #17
def test_filter_abund_1():
    script = 'filter-abund.py'

    infile = utils.copy_test_data('test-abund-read-2.fa')
    n_infile = utils.copy_test_data('test-fastq-n-reads.fq')

    in_dir = os.path.dirname(infile)
    n_in_dir = os.path.dirname(n_infile)

    counting_ht = _make_counting(infile, K=17)
    n_counting_ht = _make_counting(n_infile, K=17)

    args = [counting_ht, infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    n_outfile = n_infile + '.abundfilt'
    n_outfile2 = n_infile + '2.abundfilt'

    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])

    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs

    args = [n_counting_ht, n_infile]
    utils.runscript(script, args, n_in_dir)

    seqs = set([r.sequence for r in screed.open(n_infile)])
    assert os.path.exists(n_outfile), n_outfile

    args = [n_counting_ht, n_infile, '-o', n_outfile2]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(n_outfile2), n_outfile2
Example #18
def test_extract_paired_reads_4_output_files():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('out_pe')
    outfile2 = utils.get_temp_filename('out_se')

    script = 'extract-paired-reads.py'
    args = [infile, '-p', outfile1, '-s', outfile2]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #19
def test_gz_open_fastq():
    filename1 = os.path.join(os.path.dirname(__file__), 'test.fastq')
    filename2 = os.path.join(os.path.dirname(__file__), 'test.fastq.gz')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name

    assert n > 0
Example #20
def test_gz_open():
    filename1 = utils.get_test_data('test.fa')
    filename2 = utils.get_test_data('test.fa.gz')
    with screed.open(filename1) as f1, screed.open(filename2) as f2:
        for n, (r1, r2) in enumerate(zip(f1, f2)):
            assert r1.name == r2.name

        assert n > 0
Example #21
def test_bz2_open():
    filename1 = utils.get_test_data('test.fa')
    filename2 = utils.get_test_data('test.fa.bz2')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name

    assert n > 0
Example #22
def test_gz_open_fastq():
    filename1 = utils.get_test_data('test.fastq')
    filename2 = utils.get_test_data('test.fastq.gz')
    for n, (r1, r2) in enumerate(zip(screed.open(filename1),
                                     screed.open(filename2))):
        assert r1.name == r2.name

    assert n > 0
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prefix')
    parser.add_argument('transcripts_file')

    args = parser.parse_args()

    prefix = args.prefix
    filename = args.transcripts_file

    # first pass: count partition sizes
    partition_sizes = {}
    for n, record in enumerate(screed.open(filename, parse_description=0)):
        if n % 10000 == 0:
            print('...', n)
        partition = record.name.split()[-1]
        partition_sizes[partition] = partition_sizes.get(partition, 0) + 1

    # show top 10 biggest partitions
    print('---------------')
    print('partition, size')
    for n, (_, size) in enumerate(sorted(partition_sizes.items(),
                                         key=lambda x: -x[1])):
        print(n, size)
        if n == 10:
            break
    print('---------------')

    # now, make a sensible header for each sequence that uniquely ids it
    partition_sofar = {}
    seq_id = 1

    new_filename = os.path.basename(filename)
    if new_filename.endswith('.gz'):
        new_filename = new_filename[:-3]
    if new_filename.endswith('.fasta'):
        new_filename = new_filename[:-6]
    new_filename += '.renamed.fasta.gz'

    print('creating', new_filename)
    outfp = gzip.open(new_filename, 'wt')  # text mode: the script writes str

    for n, record in enumerate(screed.open(filename, parse_description=0)):
        if n % 10000 == 0:
            print('...writing', n)
        partition = record.name.split()[-1]
        sofar = partition_sofar.get(partition, 0) + 1
        partition_sofar[partition] = sofar
        partition_size = partition_sizes[partition]

        new_name = '%s.id%d.tr%s %d_of_%d_in_tr%s len=%d id=%s tr=%s' % \
            (prefix, seq_id, partition, sofar, partition_size, partition,
             len(record.sequence), seq_id, partition)
        outfp.write('>%s\n%s\n' % (new_name, record.sequence))
        seq_id += 1

    print('total sequences:', n + 1)
    print('total transcript families:', len(partition_sizes))
Example #24
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in zip(screed.open(s1_file),
                            screed.open(s2_file)):
        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        write_record(read1, args.output)
        write_record(read2, args.output)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)

    print('output written to', args.output, file=sys.stderr)
Example #25
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")
    
    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    output = args.output
    file1 = args.file1
    file2 = args.file2
    outfp = open(output, 'w')

    print('hashtable from', hashname1)
    ht1 = khmer.load_counting_hash(hashname1)
    ht2 = khmer.load_counting_hash(hashname2)
    
    matrix1 = {}
    matrix2 = {}
    set_x = set()
    set_y = set()
    for n, record in enumerate(screed.open(file1)):
        if n > 0 and n % 100000 == 0:
            print('...', n, file1)
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        set_x.add(med1)
        med2, _, _ = ht2.get_median_count(seq)
        set_y.add(med2)
        key = str(med1)+'-'+str(med2)
        matrix1[key] = matrix1.get(key,0) + 1

    for n, record in enumerate(screed.open(file2)):
        if n > 0 and n % 100000 == 0:
            print('...', n, file2)
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        set_x.add(med1)
        med2, _, _ = ht2.get_median_count(seq)
        set_y.add(med2)
        key = str(med1)+'-'+str(med2)
        matrix2[key] = matrix2.get(key,0) + 1

    for x in range(max(list(set_x))):
        for y in range(max(list(set_y))):
            key = str(x) + '-' + str(y)
            count1 = matrix1.get(key, 0)
            count2 = matrix2.get(key, 0)
            to_print = '%s %d %d %d\n' % (key, count1, count2, count1 + count2)
            outfp.write(to_print)
    outfp.close()
Example #26
def read_interleaved_or_paired(fq1, fq2=None):
    if fq2:
        r1s = screed.open(fq1)
        r2s = screed.open(fq2)
        for r1, r2 in zip(r1s, r2s):
            yield (r1, r2)
    else:
        reads = screed.open(fq1)
        for r1, r2 in zip(reads, reads):
            yield (r1, r2)
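
A short usage sketch for the generator above (the filename is hypothetical). The single-file branch works because zip() draws two consecutive records per step from the same screed iterator:

# Hypothetical interleaved FASTQ; names are assumed to carry /1 and /2 suffixes.
for r1, r2 in read_interleaved_or_paired('interleaved.fq'):
    assert r1.name.rsplit('/', 1)[0] == r2.name.rsplit('/', 1)[0]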
Example #27
def count_overlap(K, HT_SIZE, N_HT, filename, filename2, file_result, file_curve):

    if file_curve != 'N':
        count = 0
        for n, record in enumerate(screed.open(filename2)):
            count = count + 1
        max_count = count // 100
        file3 = open(file_curve, 'w')
        
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for i in range(0, seq_len + 1 - K):
            kmer = sequence[i:i + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)
    print(filename, 'has been consumed.')
    # false positive rate; the original referenced an undefined Z, assumed
    # here to be the number of hash tables (N_HT)
    fpr = (1 - math.exp(-n_unique / HT_SIZE)) ** N_HT
    printout1 = "%s:\n# of unique kmers: %d\n# of occupied bin: %d\n" \
        "false positive rate: %f" % (filename, n_unique, ht.n_occupied(), fpr)
    # consume second dataset
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    seq_count = 0
    for n, record in enumerate(screed.open(filename2)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for i in range(0, seq_len + 1 - K):
            kmer = sequence[i:i + K]
            if not ht2.get(kmer):
                n_unique += 1
                if ht.get(kmer):
                    n_overlap += 1
            ht2.count(kmer)
        if file_curve != 'N':
            seq_count = seq_count + 1
            if seq_count == max_count:
                string = str(n_unique) + ' ' + str(n_overlap) + '\n'
                file3 = open(file_curve, 'a')
                file3.write(string)
                file3.close()
                seq_count = 0
    print(filename2, 'has been consumed.')
    fpr = (1 - math.exp(-n_unique / HT_SIZE)) ** N_HT
    printout2 = "%s:\n# of unique k-mers: %d\n# of occupied bin: %d\n" \
        "false positive rate: %f\n===============\n" \
        "# of overlap unique k-mers: %d\n" % \
        (filename2, n_unique, ht2.n_occupied(), fpr, n_overlap)
    file_result_object = open(file_result,'w')
    file_result_object.write(printout1)
    file_result_object.write(printout2)
Example #28
def count_median(K, HT_SIZE, N_HT, filename, fileout):

    count = 0
    for n, record in enumerate(screed.open(filename)):
        count = count + 1
    max_count = count // 20
    print(max_count)

    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)
#    seq_array = []
    seq_count = 0
    median_array = [6, 7, 8, 9, 10, 11, 12]
    med = {}
    
    for median in median_array:
        med[median] = 0
    #print med
    count = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        ht.consume(sequence)
#        seq_array.append(sequence)
        seq_count = seq_count + 1
        if seq_count == max_count:
            count = count+1
            number_of_sequence_consumed = max_count*count
            counted_sequence = 0
            #print number_of_sequence_consumed
            for n2,record2 in enumerate(screed.open(filename)):
                counted_sequence = counted_sequence+1
                sequence2 = record2['sequence']
                #print sequence2
#for seq in seq_array:
                a, b, c = ht.get_median_count(sequence2)
                #print a,b,c
                for median in median_array:
                    if a == median:
                        #print "hit!"
                        med[a] = med[a]+1
                if counted_sequence == number_of_sequence_consumed:
                    break
                
            #print med
            fileout_obj = open(fileout,'a')
            print_line = str(number_of_sequence_consumed)
            for median in median_array:
                print_line = print_line+ '\t'+str(med[median])+'\t'
            print_line = print_line+'\n'
            fileout_obj.write(print_line)
            fileout_obj.close()
            seq_count = 0
            med = {}
            for median in median_array:
                med[median] = 0
Example #29
def main():
    parser = argparse.ArgumentParser(
        description='Produce interleaved files from R1/R2 paired files')

    parser.add_argument('infiles', nargs='+')
    parser.add_argument('-o', '--output',
                        dest='output', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print("given only one file; "
              "guessing that R2 file is %s" % s2_file, file=sys.stderr)

    fail = False
    if not os.path.exists(s1_file):
        print >>sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >>sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail:
        sys.exit(-1)

    print >>sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    n = 0
    for r1, r2 in zip(screed.open(s1_file), screed.open(s2_file)):
        if n % 100000 == 0:
            print('...', n, 'pairs', file=sys.stderr)
        n += 1

        name1 = r1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = r2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        r1.name = name1
        r2.name = name2
        args.output.write(output_pair(r1, r2))

    print('final: interleaved %d pairs' % n, file=sys.stderr)
Example #30
def test_sample_reads_randomly_S():
    infile = utils.get_temp_filename('test.fq')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('sample-reads-randomly.py')

    # fix random number seed for reproducibility
    args = ['-N', '10', '-R', '1', '-S', '3']

    badargs = list(args)
    badargs.extend(['-o', 'test', 'test.fq', 'test.fq'])
    (status, out, err) = runscript(script, badargs, in_dir, fail_ok=True)
    assert status == -1, (status, out, err)

    args.append('test.fq')

    runscript(script, args, in_dir)

    outfile = infile + '.subset.0'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print(seqs)
    assert seqs == set(['895:1:1:1298:13380', '895:1:1:1347:3237',
                        '895:1:1:1295:6189', '895:1:1:1342:11001',
                        '895:1:1:1252:19493', '895:1:1:1318:10532',
                        '895:1:1:1314:10430', '895:1:1:1347:8723',
                        '895:1:1:1381:4958', '895:1:1:1338:6614'])

    outfile = infile + '.subset.1'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print(seqs)
    assert seqs == set(['895:1:1:1384:20217', '895:1:1:1347:3237',
                        '895:1:1:1348:18672', '895:1:1:1290:11501',
                        '895:1:1:1386:7536', '895:1:1:1373:13994',
                        '895:1:1:1355:13535', '895:1:1:1303:6251',
                        '895:1:1:1381:4958', '895:1:1:1338:6614'])

    outfile = infile + '.subset.2'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print(seqs)
    assert seqs == set(['895:1:1:1326:7273', '895:1:1:1384:20217',
                        '895:1:1:1347:3237', '895:1:1:1353:6642',
                        '895:1:1:1340:19387', '895:1:1:1252:19493',
                        '895:1:1:1381:7062', '895:1:1:1383:3089',
                        '895:1:1:1342:20695', '895:1:1:1303:6251'])
Example #31
def main(contig1, contig2):
    ht = count(iterseq(screed.open(contig1)), iterseq(screed.open(contig2)))
Example #32
def read_partition_file(filename):
    for record_index, record in enumerate(
            screed.open(filename, parse_description=False)):
        _, partition_id = record.name.rsplit('\t', 1)
        yield record_index, record, int(partition_id)
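
A brief consumption sketch, assuming a hypothetical .part file whose record names end in a tab-separated partition ID, which is what the parser above expects:

# Tally records per partition (the filename is hypothetical).
from collections import defaultdict

counts = defaultdict(int)
for _, record, partition_id in read_partition_file('reads.fq.part'):
    counts[partition_id] += 1
print(dict(counts))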
Example #33
def watch(args):
    "Build a signature from raw FASTA/FASTQ coming in on stdin, search."

    parser = SourmashArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold',
                        default=0.05,
                        type=float,
                        help='minimum threshold for matches (default=0.05)')
    parser.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n',
        '--num-hashes',
        type=int,
        default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name',
                        type=str,
                        default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off nucleotide hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both nucleotide and protein.')

    if args.dna:
        moltype = 'DNA'
        is_protein = False
        dayhoff = False
    elif args.protein:
        moltype = 'protein'
        is_protein = True
        dayhoff = False
    else:
        moltype = 'dayhoff'
        is_protein = True
        dayhoff = True

    tree = load_sbt_index(args.sbt_name)

    # check ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        leaf = next(iter(tree.leaves()))
        tree_mh = leaf.data.minhash
        ksize = tree_mh.ksize

    E = MinHash(ksize=ksize,
                n=args.num_hashes,
                is_protein=is_protein,
                dayhoff=dayhoff)
    streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)
Example #34
import sys
import screed

with open('../../../bagel-main/roster.csv') as roster:
    haves = [line.split(',')[1] for line in roster.readlines()]
    #print haves

print('mutant_label,oligo_label,sequence,scale,purification')

for line in sys.stdin:
    if line.strip() not in haves:
        for record in screed.open('../../oligos/{}.fasta'.format(
                line.strip())):
            print('{0},{0},{1},25nm,standard'.format(line.strip(),
                                                     record.sequence))
Example #35
def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    NORMALIZE_LIMIT = args.normalize_to

    print('making hashtable')
    ht = khmer.CountingHash(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir)

    ###

    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        corrfilename = os.path.basename(filename) + '.corr'

        pass2list.append((filename, pass2filename, corrfilename))

        pass2fp = open(pass2filename, 'w')
        corrfp = open(corrfilename, 'w')

        for n, read in enumerate(screed.open(filename)):
            total_reads += 1

            if n % 10000 == 0:
                print('...', n, filename, n_aligned, n_corrected, save_pass2, \
                      total_reads)
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            # next, decide whether or to keep it.
            output_corrected = False
            if not truncated:
                n_aligned += 1

                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if graph_seq != read.sequence:
                    n_corrected += 1

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    output_corrected = True

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if output_corrected:
                corrfp.write(output_single(read, corrected))
            else:  # uncorrected...
                ht.consume(read.sequence)
                pass2fp.write(output_single(read, read.sequence))
                save_pass2 += 1

        pass2fp.close()
        corrfp.close()

        print('%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, filename))
        print('aligned %d of %d reads so far' % (n_aligned, total_reads))
        print('changed %d of %d reads so far' % (n_corrected, total_reads))

    for orig_filename, pass2filename, corrfilename in pass2list:
        print('second pass: looking at sequences kept aside in %s' % \
              pass2filename)
        corrfp = open(corrfilename, 'a')  # open once, not per read
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, n_aligned, n_corrected,
                      total_reads)

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            if truncated:  # no good alignment; output original
                corrected = read.sequence
            else:
                n_aligned += 1
                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if corrected != read.sequence:
                    n_corrected += 1

            corrfp.write(output_single(read, corrected))

        corrfp.close()
        print('removing %s' % pass2filename)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir)
    shutil.rmtree(tempdir)

    print('Aligned %d of %d total' % (n_aligned, total_reads))
    print('Changed %d of %d total' % (n_corrected, total_reads))
Example #36
def main():
    if len(sys.argv) != 4:
        mes = '*** python {} size <check-kmer-distance-py-output> <contigs.fa>'
        print(mes.format(os.path.basename(sys.argv[0])), file=sys.stderr)
        sys.exit(1)

    size = int(sys.argv[1])
    infile = sys.argv[2]
    contigf = sys.argv[3]

    d = parse_kmer_distance(infile)

    print(('#contig_name\tcontig_len\tf_start\tr_start\tf_seq\tf_tm\tf_gc\t'
           'r_seq\tr_tm\tr_gc\tta\tamp_size'))
    pair_pass = 0
    for rec in screed.open(contigf):
        name = rec.name
        if name not in d:
            continue
        seq = rec.sequence

        for f_p, r_p in d[name]:
            assert len(seq) > r_p, '*** seq length < forward primer position'
            f = seq[f_p:(f_p + size)]
            r = RC(seq[r_p:(r_p + size)])

            # primer3 functions only accept byte-strings
            f = f.encode('utf-8')
            #f = bytes(f, 'utf-8')
            r = r.encode('utf-8')
            #r = bytes(r, 'utf-8')
            if has_ambiguous(f) or has_ambiguous(r):
                continue

            # check tm
            f_tm = primer3.calcTm(f)
            if f_tm < TM_LOWER or f_tm > TM_UPPER:
                continue
            r_tm = primer3.calcTm(r)
            if r_tm < TM_LOWER or r_tm > TM_UPPER:
                continue
            if abs(f_tm - r_tm) > TM_DIFF_MAX:
                continue

            # check gc
            f_gc = check_GC(f)
            if f_gc < GC_LOWER or f_gc > GC_UPPER:
                continue
            r_gc = check_GC(r)
            if r_gc < GC_LOWER or r_gc > GC_UPPER:
                continue

            amp = seq[f_p:(r_p + size)].encode('utf-8')
            amp_tm = primer3.calcTm(amp)
            ta = 0.3 * min(f_tm, r_tm) + 0.7 * amp_tm - 14.9  # premierbiosoft
            #ta = 0.3*min(f_tm,r_tm) + 0.7*amp_tm - 25 # IDT recommendation
            ### thermodynamics check
            ### skipping here as loose filter
            # check hairpin and homodimer

            if SS:
                f_hp = primer3.calcHairpin(f)
                f_ho = primer3.calcHomodimer(f)

                if f_hp.dg < HP_DG_LIMIT or f_hp.dg > 0:
                    continue
                if f_hp.tm > ta:
                    continue
                if f_ho.dg < DI_DG_LIMIT or f_ho.dg > 0:
                    #print('+++++>', f_ho.dg)
                    continue

                r_hp = primer3.calcHairpin(r)
                r_ho = primer3.calcHomodimer(r)
                if r_hp.dg < HP_DG_LIMIT or r_ho.dg > 0:
                    continue
                if r_hp.tm > ta:
                    continue
                if r_ho.dg < DI_DG_LIMIT or r_ho.dg > 0:
                    #print('=====>', r_ho.dg)
                    continue

                # check heterodimer
                hetero = primer3.calcHeterodimer(f, r)
                if hetero.dg < DI_DG_LIMIT:
                    continue

            pair_pass += 1
            # forward, f_tm, f_gc, reverse, r_tm, r_gc, ta, amp_size
            mes = ('{}\t{}\t{}\t{}\t{}\t{:.1f}\t{:.2f}\t{}\t{:.1f}\t'
                   '{:.2f}\t{}\t{}')
            print(
                mes.format(name, len(seq), f_p, r_p, f, f_tm, f_gc, r, r_tm,
                           r_gc, ta, len(amp)))

    print('*** Pairs passed: {}'.format(pair_pass), file=sys.stderr)
Example #37
def main():
    p = argparse.ArgumentParser()
    p.add_argument('genome_files', nargs='+')
    p.add_argument('-o', '--output-csv', required=True)
    p.add_argument('-d', '--output-directory', required=True)
    args = p.parse_args()

    output_fp = open(args.output_csv, 'wt')
    w = csv.DictWriter(output_fp,
                       fieldnames=['ident', 'display_name', 'genome_filename'])
    w.writeheader()

    try:
        os.mkdir(args.output_directory)
        print(f"Created genome directory '{args.output_directory}'")
    except FileExistsError:
        print(f"Genome directory '{args.output_directory}' already exists.")

    print(f"Copying genomes into '{args.output_directory}'")

    n = 0
    for filename in args.genome_files:
        print(f"---")
        print(f"processing genome '{filename}'")

        for record in screed.open(filename):
            record_name = record.name
            break

        ident, *remainder = record_name.split(' ', 1)
        if remainder:  # is list, needs to be string
            remainder = remainder[0]
        else:
            remainder = ident

        print(f"read identifer '{ident}' and name '{remainder}'")

        destfile = os.path.join(args.output_directory,
                                f"{ident}_genomic.fna.gz")

        is_gzipped = False
        with contextlib.suppress(OSError):
            with gzip.open(filename) as fp:
                fp.read(1)
                is_gzipped = True

        if is_gzipped:
            print(f"copying '{filename}' to '{destfile}'")
            shutil.copyfile(filename, destfile)
        else:
            print(f"compressing '{filename}' into '{destfile}'")
            with open(filename, 'rb') as fp:
                with gzip.open(destfile, 'w') as outfp:
                    outfp.write(fp.read())

        w.writerow(
            dict(ident=ident, display_name=remainder,
                 genome_filename=destfile))
        n += 1

    output_fp.close()
    print('---')
    print(f"wrote {n} genome entries to '{args.output_csv}'")

    return 0
Example #38
import screed, sys
'''
pip install screed
'''

fp1 = open(sys.argv[2] + '.pe1.fq', 'w')
fp2 = open(sys.argv[2] + '.pe2.fq', 'w')

n = 0
for record in screed.open(sys.argv[1]):
    if n % 2 == 0:
        fp1.write('%s\n' % ('@' + record.name))
        fp1.write('%s\n' % record.sequence)
        fp1.write('%s\n' % '+')
        fp1.write('%s\n' % record.quality)
    else:
        fp2.write('%s\n' % ('@' + record.name))
        fp2.write('%s\n' % record.sequence)
        fp2.write('%s\n' % '+')
        fp2.write('%s\n' % record.quality)

    n = n + 1
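
A hypothetical invocation of the de-interleaving script above, which assumes records strictly alternate R1/R2:

# python deinterleave.py interleaved.fq out
#   -> writes out.pe1.fq and out.pe2.fq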
Example #39
print('loading ht')
ht = khmer.load_counting_hash(hashfile)
print('...done!')

K = ht.ksize()
print('loaded ht; K is %d, n_ht is %d, size ~ %g' % (K,
                                                     len(ht.hashsizes()),
                                                     ht.hashsizes()[0]))

outfp = gzip.open(output, 'w')

total = 0
total_masked = 0

for n, record in enumerate(screed.open(filename)):
    if n % 1000 == 0:
        print('...', n)

    x = []
    seq = record.sequence

    total += len(seq) - K + 1
    
    pos = 0
    while pos < len(seq) - K + 1:
        kmer = seq[pos:pos + K]
        if 'N' in kmer.upper():
            x.extend(kmer)
            pos += K
            continue
Example #40
def main():
    parser = build_counting_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R',
                        '--report-to-file',
                        dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_HASHSIZE and not args.loadhash:
            print(
                "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!",
                file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print(' - paired =	      %s \t\t(-p)' % args.paired, file=sys.stderr)
        print('', file=sys.stderr)
        print(
            'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' %
            (args.n_hashes * args.min_hashsize),
            file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.CountingHash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepmedpct'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(
                batchwise(screed.open(input_filename), batch_size)):
            if n > 0 and n % 100000 == 0:
                print('... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

                if report_fp:
                    print(total, total - discarded, \
                        1. - (discarded / float(total)), file=report_fp)
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print('Error: Improperly interleaved pairs %s %s' %
                          (batch[0].name, batch[1].name),
                          file=sys.stderr)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, avg, dev = ht.get_median_count(seq)

                pct = 0.
                if avg:
                    pct = dev / avg * 100

                if med < DESIRED_COVERAGE and pct < 100:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'quality'):
                        outfp.write(
                            '@%s\n%s\n+\n%s\n' %
                            (record.name, record.sequence, record.quality))
                    else:
                        outfp.write('>%s\n%s\n' %
                                    (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print('DONE with', input_filename, '; kept', total - discarded, 'of',\
                total, 'or', int(100. - discarded / float(total) * 100.), '%')
            print('output in', output_name)
        else:
            print('SKIPPED empty file', input_filename)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print("**", file=sys.stderr)
        print("** ERROR: the counting hash is too small for", file=sys.stderr)
        print("** this data set.  Increase hashsize/num ht.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(-1)
Example #41
#!/usr/bin/env python
import screed, sys

fp = open('test-contigs.fa', 'wt')

for n, record in enumerate(screed.open('63.fa')):
    for i in range(0, len(record.sequence), 100000):
        fragment = record.sequence[i:i + 100000]
        fp.write(f'>seq{n}.{i}\n{fragment}\n')

fp.close()
Example #42
#!/usr/bin/env python

from khmer.utils import write_record
import screed
import sys

mutations = {
    0: (29, 19, 'T', 'A'),
    1: (19, 41, 'G', 'A'),
    2: (67, 5, 'G', 'T'),
    3: (63, 20, 'T', 'C'),
}

readnum = 0
for n, record in enumerate(screed.open(sys.argv[1])):
    if n in mutations:
        refrstart, mismatchpos, origbase, newbase = mutations[n]
        readseq = record.sequence[refrstart:refrstart + 50]
        assert readseq[mismatchpos] == origbase
        mutseq = readseq[:mismatchpos] + newbase + readseq[mismatchpos + 1:]
        oldseqname = record.name.split('-')[-1]

        readnum += 1
        record.name = 'read{}_{}_exact'.format(readnum, oldseqname)
        record.sequence = readseq
        record.quality = '3' * 50
        write_record(record, sys.stdout)

        readnum += 1
        record.name = 'read{}_{}_mismatch'.format(readnum, oldseqname)
        record.sequence = mutseq
        record.quality = '3' * 50
        write_record(record, sys.stdout)
Example No. 43
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making countgraph', file=sys.stderr)
        htable = khmer_args.create_countgraph(args)

    input_filename = None

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for e in filenames:
        files.append([e, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader, report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            if output_name != '/dev/stdout':
                outfp.close()

    # finished - print out some diagnostics.

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:",
              file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Example No. 44
0
def main():
    if len(sys.argv) != 3:
        mes = '*** Usage: python {} params.config file.uniq2ref.primer'
        print(
            mes.format(os.path.basename(sys.argv[0])),
            file=sys.stderr,
        )
        sys.exit(1)

    configf = sys.argv[1]
    primerfile = sys.argv[2]

    # safe_load avoids executing arbitrary YAML tags from the config file
    with open(configf) as cfp:
        d = yaml.safe_load(cfp)

    pass_cnt = 0
    total_cnt = 0
    for rec in screed.open(primerfile):
        total_cnt += 1
        _name = rec.name
        name, _contig = _name.split(None, 1)
        contig_len = _contig.split('__', 1)[1]
        seq = rec.sequence
        seq_rc = RC(seq)
        # primer3 functions only accept byte-strings
        seq = seq.encode('utf-8')
        seq_rc = seq_rc.encode('utf-8')
        #seq = bytes(seq, 'utf-8')
        trig = False
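        # try the primer in both directions: forward ('f') and
        # reverse complement ('r'); the record passes if either works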
        for di, seq in zip(('f', 'r'), (seq, seq_rc)):
            if has_ambiguous(seq):
                continue

            # check tm
            tm = primer3.calcTm(seq)
            if tm < d['TM_LOWER'] or tm > d['TM_UPPER']:
                continue

            # check gc
            gc = check_gc(seq)
            if gc < d['GC_LOWER'] or gc > d['GC_UPPER']:
                continue

            if d['GC_CLAMP']:
                cnt = end_gc_count(seq)
                if cnt > 3 or cnt < 1:
                    continue

            if d['SS']:
                hp = primer3.calcHairpin(seq)
                ho = primer3.calcHomodimer(seq)
                if hp.dg < d['HP_DG_LIMIT'] or hp.dg > 0:
                    continue
                if ho.dg < d['DI_DG_LIMIT'] or ho.dg > 0:
                    continue

            trig = True
            mes = '>{}__{}  contiglen__{};di__{};tm__{};gc__{}\n{}'
            print(
                mes.format(name, di, contig_len, di, tm, gc, seq.decode()),
                file=sys.stdout,
            )

        if trig:
            pass_cnt += 1

    mes = '*** # of primers (at least one direction) passed filter: {}'
    print(mes.format(pass_cnt), file=sys.stderr)
    if total_cnt == 0:
        mes = ('*** Empty file detected: {} (file.uniq2ref.primer), '
               'skipping..')
        print(
            mes.format(os.path.basename(primerfile)),
            file=sys.stderr,
        )
        sys.exit(0)
Example No. 45
0
import os
import sys

import khmer
import screed

K = 32                  # assumed: the k-mer size is not shown in this excerpt
HASHTABLE_SIZE = 4e8    # assumed: the table size is not shown in this excerpt
N_HT = 4

THRESHOLD = 0.9

filename1 = sys.argv[1]
filename2 = sys.argv[2]
uniq1 = open(os.path.basename(sys.argv[1]) + '.uniq', 'w')
uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
paths = sys.argv[3]

kh1 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
kh1.consume_fasta(filename1)
kh2 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
kh2.consume_fasta(filename2)

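# for each path, count how many of its k-mers are present in the first
# input set; call the path "present" if that fraction reaches THRESHOLD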
for record in screed.open(paths):
    n = 0
    n_present = 0

    path = record.sequence
    n = len(path) - K + 1
    for i in range(n):
        if kh1.get(path[i:i + K]):
            n_present += 1

    if n_present / float(n) >= THRESHOLD:
        present1 = True
    else:
        present1 = False

    n = 0
Example No. 46
0
def main():
    parser = build_construct_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R',
                        '--report-to-file',
                        dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

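            # digital normalization: keep (and count) a read only while the
            # median k-mer count of its sequence is below the coverage cutoff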
            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example No. 47
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument(
        "--bits-theta",
        help=
        "Tuning parameter controlling trade off of speed vs alignment sensitivity",
        default=1.0,
        type=float,
        dest="bits_theta")
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_tables
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_tablesize
        print >> sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_tables * args.min_tablesize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            # align the read against the graph; 'truncated' means the
            # aligner could not extend the alignment across the full read
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                # rebuild the graph-corrected sequence column by column,
                # taking the read base wherever the graph alignment has a gap
                graph_seq = ""
                for i in range(len(graph_alignment)):
                    if graph_alignment[i] == "-":
                        graph_seq += read_alignment[i]
                    else:
                        graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        if total:
            print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example No. 48
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling"
                        "trade off of speed vs alignment sensitivity",
                        default=1.0,
                        type=float,
                        dest="bits_theta")
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % \
            args.max_tablesize, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.CountingHash(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            # next, decide whether or not to keep it.
            keep = False
            if truncated:
                keep = True  # keep all truncated alignments - why?
            else:

                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")
                # OR?
                #graph_seq = ""
                #for i in range(len(graph_alignment)):
                #    if graph_alignment[i] == "-":
                #        graph_seq += read_alignment[i]
                #    else:
                #        graph_seq += graph_alignment[i]

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    keep = True

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      "
                    "{6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    "".format(score, graph_alignment, read_alignment,
                              truncated, keep, seq, record.sequence,
                              record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if total:
            print('DONE with', input_filename, \
                '; kept', total - discarded, 'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
Example No. 49
0
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P',
                        '--pendants',
                        action="store_true",
                        help="don't remove low abundance pendants")
    args = parser.parse_args(argv)

    ksize = args.ksize

    trim = not args.pendants

    # track links between contig IDs
    link_d = collections.defaultdict(set)

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')

    # track offsets, mean abunds, and # k-mers for each contig
    offsets = {}
    mean_abunds = {}
    sizes = {}
    sequences = {}

    # walk the input unitigs file, tracking links between contigs and
    # writing them to contigs_out.
    max_contig_id = 0
    print('reading unitigs from {}'.format(args.bcalm_unitigs))
    for n, record in enumerate(screed.open(args.bcalm_unitigs)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr, end='\r')

        name = record.name
        name_split = name.split()

        # note: contig_id may not be in order.
        contig_id = int(name_split[0])

        # track the various links
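        # BCALM link fields look like 'L:+:1234:-'; the third
        # colon-separated field is the neighboring unitig's id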
        links = [x for x in name_split[1:] if x.startswith('L:')]
        link_ids = [x.split(':')[2] for x in links]
        link_ids = [int(x) for x in link_ids]

        if args.debug:
            print('link_ids for {} are {}'.format(contig_id, link_ids))

        link_d[contig_id].update(link_ids)

        # get mean abund
        abund = [x for x in name_split[1:] if x.startswith('km:')]
        assert len(abund) == 1, abund
        abund = abund[0].split(':')
        assert len(abund) == 3
        abund = float(abund[2])

        mean_abunds[contig_id] = abund

        # where are we in the output file?
        assert contig_id not in offsets

        sequences[contig_id] = record.sequence

        sizes[contig_id] = len(record.sequence) - ksize + 1

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        non_pendants = [x for x, N in link_d.items() if len(N) > 1 and \
                        mean_abunds[x] > TRIM_CUTOFF]
    else:
        non_pendants = list(link_d.keys())
    aliases = {x: i for i, x in enumerate(non_pendants)}
    n = len(aliases)

    for x, i in aliases.items():
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs)
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for node, edgelist in link_d.items():
        if node not in aliases:
            continue
        for next_node in edgelist:
            if next_node not in aliases:
                continue
            gxtfp.write('{} {}\n'.format(aliases[node], aliases[next_node]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v], mean_abunds[v],
                                                 sizes[v]))
Example No. 50
0
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >> sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >> sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >> sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open(
                    '{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                   g=g,
                                                   ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open(
                                '{pref}_base_{g}.{ext}'.format(
                                    pref=output_pref, g=g, ext=extension),
                                'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(
                        record.sequence, label)

                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}\n+\n{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))

            except IOError as e:
                print >> sys.stderr, '!! ERROR !!', e
                print >> sys.stderr, '...error splitting input. exiting...'

    except IOError as e:
        print >> sys.stderr, '!! ERROR: !!', e
        print >> sys.stderr, \
            '...error consuming {i}. exiting...'.format(i=input_fastp)

    print >> sys.stderr, \
        'done consuming input sequence. added {t} tags and ' \
        '{l} labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >> sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >> sys.stderr, '!! ERROR: !!', error
            print >> sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >> sys.stderr, \
                        '\tswept {n} reads [{nc} labeled, {no} orphaned] ' \
                        '** {sec}s ({sect}s total)'.format(
                            n=_, nc=n_labeled, no=n_orphaned,
                            sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >> sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >> sys.stderr, '! WARNING: Sweep finished with errors !'
        print >> sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >> sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >> sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >> sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >> sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
Example No. 51
0
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

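    # consume reads until the running average of per-read median k-mer
    # count exceeds the target coverage, then stop collecting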
    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n, )

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
Example No. 52
0
def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile)
    infiles = [args.infile]
    check_space(infiles)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print 'reading file "%s"' % args.infile
    print 'outputting interleaved pairs to "%s.pe"' % outfile
    print 'outputting orphans to "%s.se"' % outfile

    last_record = None
    last_name = None

    n_pe = 0
    n_se = 0

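    # stream through the reads, holding on to the previous record; two
    # consecutive records that form a pair are written interleaved, and
    # anything else is written out as a singleton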
    record = None
    index = 0
    for index, record in enumerate(screed.open(sys.argv[1])):
        if index % 100000 == 0 and index > 0:
            print '...', index
        name = record['name'].split()[0]

        if last_record:
            if is_pair(last_name, name):
                paired_fp.write(output_pair(last_record, record))
                name, record = None, None
                n_pe += 1
            else:
                single_fp.write(output_single(last_record))
                n_se += 1

        last_name = name
        last_record = record

    if last_record:
        if is_pair(last_name, name):
            paired_fp.write(output_pair(last_record, record))
            name, record = None, None
            n_pe += 1
        else:
            single_fp.write(output_single(last_record))
            name, record = None, None
            n_se += 1

    if record:
        single_fp.write(output_single(record))
        n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print 'DONE; read %d sequences, %d pairs and %d singletons' % \
          (index + 1, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
Example No. 53
0
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\
            args.num_reads
        print >>sys.stderr, 'Subsampled reads will be placed in %s' % \
            output_filename
        print >> sys.stderr, ''
    else:  # > 1
        print >>sys.stderr, 'Subsampling %d reads, %d times,' \
            % (args.num_reads, num_samples), ' using reservoir sampling.'
        print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \
            % output_filename
        print >> sys.stderr, ''

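    # one independent reservoir per requested subsample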
    reads = []
    for n in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print >> sys.stderr, 'opening', filename, 'for reading'
        screed_iter = screed.open(filename, parse_description=False)

        for count, (_, ispair, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(screed_iter,
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print >> sys.stderr, '...', count, 'reads scanned'
                if count >= args.max_reads:
                    print >>sys.stderr, 'reached upper limit of %d reads' % \
                        args.max_reads, '(see -M); exiting'
                    break

            # collect first N reads
            if count < args.num_reads:
                for n in range(num_samples):
                    reads[n].append((rcrd1, rcrd2))
            else:
                assert len(reads[n]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print >>sys.stderr, 'Writing %d sequences to %s' % \
            (len(reads[0]), output_filename)
        if not output_file:
            output_file = open(output_filename, 'w')

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print >>sys.stderr, 'Writing %d sequences to %s' % \
                (len(reads[n]), n_filename)
            output_file = open(n_filename, 'w')
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
Example No. 54
0
#! /usr/bin/env python
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected]
#
import sys
import screed
import khmer

K = 32

infile = sys.argv[1]

ht = khmer.new_hashbits(K, 1, 1)
ht.consume_partitioned_fasta(infile)

for n, record in enumerate(screed.open(infile)):
    if n % 10000 == 0:
        print '... checking', n
    assert ht.is_single_partition(record.sequence)
Example No. 55
0
def normalize_by_median(input_filename, outfp, htable, args, report_fp=None):

    desired_coverage = args.cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    index = -1
    total = 0
    discarded = 0
    for index, batch in enumerate(batchwise(screed.open(
            input_filename), batch_size)):
        if index > 0 and index % 100000 == 0:
            print '... kept {kept} of {total} or {perc:2}%'.format(
                kept=total - discarded, total=total,
                perc=int(100. - discarded / float(total) * 100.))
            print '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        if args.paired:
            if not validpair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs '
                              '{b0} {b1}'.format(b0=batch[0].name,
                                                 b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are longer than K
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                if hasattr(record, 'accuracy'):
                    outfp.write(
                        '@{name}\n{seq}\n'
                        '+\n{acc}\n'.format(name=record.name,
                                            seq=record.sequence,
                                            acc=record.accuracy))
                else:
                    outfp.write(
                        '>{name}\n{seq}\n'.format(name=record.name,
                                                  seq=record.sequence))
        else:
            discarded += batch_size

    return total, discarded
Example No. 56
0
#! /usr/bin/env python
import screed
import sys
import random

random.seed(1)  # make this reproducible, please.

COVERAGE = 200     # target read depth to simulate
READLEN = 100      # length of each simulated read
ERROR_RATE = 100   # on average, one error per ERROR_RATE bases

record = iter(screed.open(sys.argv[1])).next()
genome = record.sequence
len_genome = len(genome)

n_reads = int(len_genome * COVERAGE / float(READLEN))
reads_mut = 0
total_mut = 0

for i in range(n_reads):
    start = random.randint(0, len_genome - READLEN)
    read = genome[start:start + READLEN].upper()

    # reverse complement?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # error?
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
Example No. 57
0
def main():
    p = argparse.ArgumentParser()
    p.add_argument("sample_id")
    p.add_argument("gather_csv")
    p.add_argument("--outdir", default="outputs")
    args = p.parse_args()

    sample_id = args.sample_id
    outdir = args.outdir.rstrip("/")

    print(f"reading gather results from {args.gather_csv}")
    rows = []
    with open(args.gather_csv, "rt") as fp:
        r = csv.DictReader(fp)
        for row in r:
            rows.append(row)
    print(f"...loaded {len(rows)} results total.")

    print("checking input/output pairs:")
    pairs = []
    fail = False
    for row in rows:
        acc = row["name"].split()[0]

        filename = f"{outdir}/mapping/{sample_id}.x.{acc}.mapped.fq.gz"
        overlapping = f"{outdir}/mapping/{sample_id}.x.{acc}.overlap.fq.gz"
        leftover = f"{outdir}/mapping/{sample_id}.x.{acc}.leftover.fq.gz"

        if not os.path.exists(filename):
            print(
                f"ERROR: input filename {filename} does not exist. Will exit.")
            fail = True

        pairs.append((acc, filename, overlapping, leftover))

    if fail:
        print("Some required input files not found - exiting.")
        sys.exit(-1)

    ignore_reads = set()
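    # walk the mapped reads genome by genome: the first time a read name is
    # seen it counts as leftover; any later occurrence is overlapping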
    for n, (acc, filename, overlapping, leftover) in enumerate(pairs):
        overlap_fp = gzip.open(overlapping, "wt")
        leftover_fp = gzip.open(leftover, "wt")

        print('-' * 30)
        print(f"reading sequences from {filename};")
        print(f"writing overlapping to {overlapping}")
        print(f"writing remaining to {leftover}")

        n_wrote = 0
        screed_fp = screed.open(filename)
        for record in screed_fp:
            fq = f"@{record.name}\n{record.sequence}\n+\n{record.quality}\n"
            if record.name in ignore_reads:
                overlap_fp.write(fq)
            else:
                ignore_reads.add(record.name)
                leftover_fp.write(fq)
            n_wrote += 1
        screed_fp.close()

        print(f"wrote {n_wrote} leftover records for {sample_id}.x.{acc};")
        print(f"{len(ignore_reads)} total reads to ignore moving forward.")
        print(f"file {n+1} of {len(pairs)} total")

        overlap_fp.close()
        leftover_fp.close()

    # <-- here is where we can go through the input reads and output unmapped.
    # (OR, save 'ignore_reads' and let another script handle it.)

    return 0
Example No. 58
0
def create_records_iter():
    print('reading cDBG nodes from {}'.format(contigs_filename))
    return screed.open(contigs_filename)
Example No. 59
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
Example No. 60
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('catlas_prefix', help='input file')
    args = parser.parse_args()

    basename = os.path.basename(args.catlas_prefix)
    cdbg = os.path.join(args.catlas_prefix, 'cdbg.gxt')
    infp = open(cdbg, 'rt')
    outname = os.path.join(args.catlas_prefix, 'cdbg.gml')
    outfp = open(outname, 'wt')

    print('reading contig sizes')
    contigsfile = os.path.join(args.catlas_prefix, 'contigs.fa.gz')
    node_sizes = {}
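    # cDBG contigs are named by integer node id; record each node's sequence
    # length for use as a vertex size attribute in the GML output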
    for n, record in enumerate(screed.open(contigsfile)):
        node_sizes[int(record.name)] = len(record.sequence)

    print('converting {} to {}...'.format(cdbg, outname))

    writer = GmlWriter(outfp)

    num_nodes = int(next(infp))
    for x in range(num_nodes):
        writer.add_vertex(x, node_sizes.get(x, 1))

    for line in infp:
        u, v = line.split()
        writer.add_edge(int(u), int(v))

    writer.done()