Example #1
0
def main(argv):
    if len(argv) != 2:
        print
        print 'myr shred'
        print
        print 'Generate fake Illumina reads.'
        print 'Not guaranteed to be sanely calibrated, for testing only.'
        print
        print 'Usage:'
        print
        print '    myr shred <number of reads> <sequence.fna>'
        print
        return 1

    how_many = int(argv[0])
    seq = sequence.sequence_file_iterator(argv[1]).next()[1]

    READ_SIZE = 33
    error_p = numpy.array([
        0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261, 0.00928905,
        0.00938066, 0.00936397, 0.00939301, 0.00947136, 0.00952966, 0.00956763,
        0.01073044, 0.01091972, 0.01121085, 0.01159389, 0.01200634, 0.01233303,
        0.01271543, 0.01334389, 0.01349712, 0.01412138, 0.01462227, 0.01720922,
        0.01617627, 0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367,
        0.02354595, 0.02560759, 0.03480737
    ])

    for i in xrange(how_many):
        print '>read%d' % i

        pos = random.randint(len(seq) - READ_SIZE + 1)
        read = seq[pos:pos + READ_SIZE]
        if random.randint(2): read = sequence.reverse_complement(read)

        read = read.copy()
        mutations = random.random(READ_SIZE) < error_p
        read[mutations] = (read[mutations] + random.randint(
            1, 4, size=numpy.sum(mutations)).astype('uint8')) % 4

        print sequence.string_from_sequence(read)
Example #2
0
def main(argv):
    """Entry point for 'myr assess': summarise errors in a sample of reads.

    argv is [<sample size>, <max errors>, <contigs file>, <reads>, ...].
    A sample of the reads is produced with sample(), aligned against the
    contigs with invoke_align(), and the resulting hit file is parsed.
    For every read with a clear best hit the per-position substitution
    and indel counts are accumulated, then a report is printed to stdout.

    Returns 1 after printing usage to stderr when fewer than four
    arguments are given; otherwise falls off the end and returns None.
    """
    # argv[2] and argv[3:] are both used below, so the two numbers, the
    # contigs file and at least one reads file are all mandatory.
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr assess <sample size> <max errors> <contigs file> <reads> [<reads> ...]'
        print >> sys.stderr, ''
        return 1

    sample_size = int(argv[0])
    max_errors = int(argv[1])

    sample_file = sample(argv[3:], sample_size)

    hit_file = invoke_align(argv[2], sample_file, max_errors)

    # Every sampled read gets an entry, so reads without hits are counted.
    hits = {}
    max_length = 0
    for item in sequence.sequence_file_iterator(sample_file):
        hits[item[0]] = []
        max_length = max(len(item[1]), max_length)

    # try/finally rather than `with` so the handle is always closed without
    # depending on the with-statement being available.
    hit_stream = open(hit_file, 'rb')
    try:
        for line in hit_stream:
            line = line.strip()
            # Blank lines carry no hit and would break the unpacking below.
            if not line or line.startswith('#'):
                continue

            name, direction, n_errors, span, read_ali, ref_ali = line.split()
            hits[name].append(
                (int(n_errors), direction == 'fwd', read_ali, ref_ali))
    finally:
        hit_stream.close()

    n_ambiguous = 0
    n_unhit = 0
    error_count = [0] * (max_errors + 1)
    error_pos_count = [0] * max_length
    indel_pos_count = [0] * max_length
    for name in hits:
        read_hits = hits[name]
        # Tuples start with the error count, so sorting puts the best first.
        read_hits.sort()
        if not read_hits:
            n_unhit += 1
            continue

        # The best hit only counts when the runner-up has at least two
        # more errors than it does.
        if len(read_hits) > 1 and read_hits[0][0] + 2 > read_hits[1][0]:
            n_ambiguous += 1
            continue

        error_count[read_hits[0][0]] += 1

        forward, read_ali, ref_ali = read_hits[0][1:]
        if not forward:
            read_ali = read_ali[::-1]
            ref_ali = ref_ali[::-1]
            # Don't worry about complementing...
        read_pos = 0
        for read_char, ref_char in zip(read_ali, ref_ali):
            if read_char == '-' or ref_char == '-':
                indel_pos_count[read_pos] += 1
            elif read_char != ref_char:
                error_pos_count[read_pos] += 1

            # Only columns that consume a read base advance the position.
            if read_char != '-':
                read_pos += 1

    print 'Error profile'
    for i in xrange(max_length):
        print 'pos=%5d snps=%5d indels=%5d' % (i + 1, error_pos_count[i],
                                               indel_pos_count[i])
    print

    print 'Sampled', len(hits), 'reads'
    print n_ambiguous, 'had no clear best hit'
    print n_unhit, 'hit nothing'
    # NOTE(review): the two highest error counts are omitted, presumably
    # because the ambiguity test above cannot be decided for them when the
    # aligner stops at max_errors -- confirm.
    for i in xrange(max_errors + 1 - 2):
        print '%3d errors: %d' % (i, error_count[i])
Example #3
0
 def load_sequences(self, filename):
     """Add every record read from filename to this object.

     Each item yielded by sequence.sequence_file_iterator(filename) is
     unpacked as a (name, seq) pair and passed to self.add_sequence.
     """
     records = sequence.sequence_file_iterator(filename)
     for record in records:
         name, seq = record
         self.add_sequence(name, seq)
Example #4
0
def main(argv):
    """Command-line entry point for 'myr align'.

    argv is [<max error>, <indel cost>, <reference.fna>, <reads.fna>, ...].
    For every sequence in the reference file, all reads (and their reverse
    complements) are grouped by length and handed out in batches to
    PROCESSES child workers.  Header lines and whatever the children report
    are printed to stdout; progress and diagnostics go to stderr.

    Returns 1 after printing usage to stderr when fewer than four arguments
    are given, and 0 once all references have been processed.  The two
    numeric arguments are checked with assert statements.
    """
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr align <max error> <indel cost> <reference.fna> <reads.fna> [<reads.fna>...]'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Align short reads to a reference genome.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Files can be in FASTA or ELAND format.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Each subsitution counts as one error. The cost of an indel can be specified,'
        print >> sys.stderr, 'but must be an integer. The whole read (not just part of it) must align to '
        print >> sys.stderr, 'the reference with less than the specified maximum errors in order to'
        print >> sys.stderr, 'produce a hit.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'For Illumina reads, we suggest (on the basis of very little experience):'
        print >> sys.stderr, ''
        print >> sys.stderr, '    myr align 6 2 reference.fna reads.fna'
        print >> sys.stderr, ''
        return 1

    if CELL_PROCESSOR:
        print >> sys.stderr, 'Cell processor detected'
    else:
        print >> sys.stderr, 'Cell processor not detected'

    print >> sys.stderr, 'Using', PROCESSES, 'processes'

    maxerror = int(argv[0])
    assert maxerror >= 0
    indel_cost = int(argv[1])
    assert indel_cost >= 1

    # Every child is in exactly one of these two lists: `waiting` holds the
    # idle ones, `running` the ones that have been sent an 'align' batch.
    waiting = [children.Self_child() for i in xrange(PROCESSES)]
    running = []

    t1 = time.time()
    # One-element list so that handle_events() can update the count in place.
    total_alignments = [0]

    def handle_events():
        """Process one message from each child returned by children.wait().

        A 'done' message moves the child from `running` back to `waiting`
        and refreshes the status line; any other message has its value
        printed to stdout.
        """
        for child in children.wait(running):
            message, value = child.receive()
            if message == 'done':
                running.remove(child)
                waiting.append(child)

                dt = time.time() - t1
                # NOTE(review): value is presumably the number of sequences
                # in the finished batch (each read is queued twice) -- confirm
                # against the child implementation.
                total_alignments[
                    0] += value // 2  # Forwards + backwards == 1 alignment
                util.show_status(
                    '%d alignments in %.2f seconds, %.4f per alignment' %
                    (total_alignments[0], dt, dt / total_alignments[0]))
            else:
                print value

    print '#Max errors:', maxerror
    print '#Indel cost:', indel_cost

    for ref_name, ref_seq in sequence.sequence_file_iterator(argv[2]):
        print '#Reference:', ref_name

        # All children are idle here: the previous iteration only ends once
        # `running` has been drained (see the loop at the bottom).
        for child in waiting:
            child.send(('ref', ref_seq))

        # Collect reads of the same length,
        # and do them in batches
        buckets = {}  # length -> [ [name], [seq] ]

        def do_bucket(length, only_if_full):
            """Send up to one chunk of the bucket for `length` to a child.

            With only_if_full true, nothing happens unless the bucket holds
            at least a full chunk.  Blocks in handle_events() until a child
            is idle.  An emptied bucket is removed from `buckets`.
            """
            if CELL_PROCESSOR:
                #Hmmm
                chunk = 1800000 // (length * ((maxerror + 1) * 2 + 5))
                # Round down to a multiple of 128, but never below 128.
                chunk -= chunk & 127
                chunk = max(chunk, 128)
            else:
                chunk = 8192

            if only_if_full and len(buckets[length][0]) < chunk:
                return

            # Take the first `chunk` names and sequences off the bucket.
            read_names = buckets[length][0][:chunk]
            del buckets[length][0][:chunk]
            read_seqs = buckets[length][1][:chunk]
            del buckets[length][1][:chunk]

            if not buckets[length][0]:
                del buckets[length]

            while not waiting:
                handle_events()

            #print >> sys.stderr, 'Starting batch alignment of', len(read_seqs), '%d-mers'%length

            child = waiting.pop()
            child.send(
                ('align', (read_seqs, read_names, maxerror, indel_cost)))
            running.append(child)

        for read_name, read_seq in sequence.sequence_files_iterator(argv[3:]):
            length = len(read_seq)
            if length not in buckets:
                buckets[length] = ([], [])
            # Each read is queued twice: as given and reverse complemented.
            buckets[length][0].append(read_name + ' fwd')
            buckets[length][1].append(read_seq)
            buckets[length][0].append(read_name + ' rev')
            buckets[length][1].append(sequence.reverse_complement(read_seq))

            do_bucket(length, True)

        # Flush the partially filled buckets; iterate over a copy of the
        # keys because do_bucket() deletes emptied buckets.
        while buckets:
            for length in list(buckets):
                do_bucket(length, False)

        # Wait for every outstanding batch before moving to the next
        # reference sequence.
        while running:
            handle_events()

    for child in waiting:
        child.close()

    util.show_status('')

    return 0
Example #5
0
def read_files(argv):
    """Load a reference sequence and the hits from one or more hit files.

    The options '-s' and '-e' (integers, default 0) are first extracted
    from argv with get_option_value.  Of what remains, argv[0] names the
    reference file, from which the first record's sequence is taken, and
    argv[1:] name hit files read with iter_hit_file.

    When either clip option is non-zero every alignment is passed through
    clip_alignment; for hits that are not forward the two clip amounts are
    swapped.  The hit's start and end are then adjusted by the amounts
    clip_alignment reports.

    Returns a (reference, hits) tuple.  Raises Bad_option when fewer than
    two filenames remain after option parsing.
    """
    clip_start, argv = get_option_value(argv, '-s', int, 0)
    clip_end, argv = get_option_value(argv, '-e', int, 0)

    if len(argv) < 2:
        raise Bad_option(
            'Expected at least two filenames, a reference genome and and alignment file'
        )

    first_record = sequence.sequence_file_iterator(argv[0]).next()
    reference = first_record[1]

    hits = Hits()

    # The clip amounts never change, so decide once whether to clip at all.
    wants_clipping = bool(clip_start or clip_end)

    for hit_filename in argv[1:]:
        for record in iter_hit_file(hit_filename):
            _ref_name, name, forward, start, end, read_ali, ref_ali = record

            # Grow the container by one slot and fill in the new hit.
            index = hits.length
            hits.resize(index + 1)

            hits.name[index] = name
            hits.forward[index] = forward
            hits.start[index] = start
            hits.end[index] = end
            hits.read_ali[index] = read_ali
            hits.ref_ali[index] = ref_ali

            if not wants_clipping:
                continue

            # Hits that are not forward get the two clip amounts swapped.
            if hits.forward[index]:
                first_clip, second_clip = clip_start, clip_end
            else:
                first_clip, second_clip = clip_end, clip_start

            clipped = clip_alignment(hits.read_ali[index],
                                     hits.ref_ali[index],
                                     first_clip, second_clip)
            new_read_ali, new_ref_ali, clipped_start, clipped_end = clipped

            hits.read_ali[index] = new_read_ali
            hits.ref_ali[index] = new_ref_ali
            hits.start[index] += clipped_start
            hits.end[index] -= clipped_end

    hits.you_are_dirty()

    return reference, hits