Example #1
0
def main(argv):
    """Generate fake Illumina reads from a reference and print them as FASTA.

    argv must hold exactly two items: the number of reads to generate and
    the path of a sequence file.  Only the first record of that file is
    used as the reference.

    Each read is a READ_SIZE-long slice taken at a random position, reverse
    complemented with probability 1/2, with per-position substitutions
    drawn from the error_p table.  Reads go to standard output as
    '>readN' / sequence line pairs.

    Returns 1 after printing the usage text when the argument count is
    wrong; otherwise returns None.

    Raises ValueError if reads are requested but the reference is shorter
    than READ_SIZE.
    """
    if len(argv) != 2:
        usage = (
            '',
            'myr shred',
            '',
            'Generate fake Illumina reads.',
            'Not guaranteed to be sanely calibrated, for testing only.',
            '',
            'Usage:',
            '',
            '    myr shred <number of reads> <sequence.fna>',
            '',
        )
        for line in usage:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(line)
        return 1

    how_many = int(argv[0])
    seq = next(sequence.sequence_file_iterator(argv[1]))[1]

    READ_SIZE = 33
    # Per-position substitution probability; one entry per read position.
    error_p = numpy.array([
        0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261, 0.00928905,
        0.00938066, 0.00936397, 0.00939301, 0.00947136, 0.00952966, 0.00956763,
        0.01073044, 0.01091972, 0.01121085, 0.01159389, 0.01200634, 0.01233303,
        0.01271543, 0.01334389, 0.01349712, 0.01412138, 0.01462227, 0.01720922,
        0.01617627, 0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367,
        0.02354595, 0.02560759, 0.03480737
    ])

    # Number of valid start positions for a full-length read.
    n_starts = len(seq) - READ_SIZE + 1
    if how_many > 0 and n_starts <= 0:
        raise ValueError(
            'reference sequence (%d bases) is shorter than the read size (%d)'
            % (len(seq), READ_SIZE))

    for i in range(how_many):
        print('>read%d' % i)

        # `random` is used with numpy.random call signatures here:
        # randint(high) draws from [0, high).
        pos = random.randint(n_starts)
        read = seq[pos:pos + READ_SIZE]
        if random.randint(2):
            read = sequence.reverse_complement(read)

        # Copy so that mutating the read never touches the reference.
        read = read.copy()
        mutations = random.random(READ_SIZE) < error_p
        # Adding 1..3 modulo 4 always yields a different value; this
        # presumably relies on bases being encoded as 0..3 -- TODO confirm.
        shift = random.randint(1, 4, size=numpy.sum(mutations)).astype('uint8')
        read[mutations] = (read[mutations] + shift) % 4

        print(sequence.string_from_sequence(read))
Example #2
0
    def callback(working_dir):
        """Write a uniform random sample of the input reads to sample.fna.

        Streams every record yielded by
        sequence.sequence_files_iterator(read_files) once, keeping at most
        n_samples of them by reservoir sampling, then writes the kept
        records as '>name' / sequence line pairs to
        <working_dir>/sample.fna.

        read_files and n_samples come from the enclosing scope.
        """
        sys.stderr.write('Sampling\n')

        samples = []
        records = sequence.sequence_files_iterator(read_files)
        for n, item in enumerate(records, 1):
            if len(samples) < n_samples:
                # Reservoir not full yet: keep everything.
                samples.append(item)
            elif random.random() * n < n_samples:
                # Keep the n-th record with probability n_samples / n and
                # evict a uniformly chosen resident.  The comparison must be
                # this way round: testing random() * n_samples < n is always
                # true once n > n_samples and biases the sample towards the
                # end of the input.
                samples[random.randrange(n_samples)] = item

        # The with-block guarantees the file is flushed and closed.
        with open(os.path.join(working_dir, 'sample.fna'), 'wb') as outfile:
            for item in samples:
                outfile.write('>%s\n' % item[0])
                outfile.write('%s\n' % sequence.string_from_sequence(item[1]))
Example #3
0
    def show(self, cursor, distance_cutoff):
        """Redraw self.screen with the neighbourhood of `cursor`.

        Starting from `cursor`, explores locations in order of increasing
        distance: a step of +1 / -1 along a sequence costs 1, following a
        base link costs 0.  Locations further than `distance_cutoff` are not
        explored.  The visited locations are grouped into contiguous runs
        ("contigs", one screen row each) and into columns given by
        sort.compact_robust_topological_sort over the link graph, then drawn
        so that the cursor sits at the centre of the screen.

        Returns a tuple (cursor_column, cursor_y): the table column (a list
        with one cell per contig) that contains the cursor, and the row
        index of the cursor's contig.

        NOTE(review): the layout looks the cursor up via self.cursor, not
        the `cursor` argument.  If the two differ, cursor_x / cursor_y /
        cursor_column may never be assigned -- confirm callers always pass
        self.cursor.
        """
        # location -> horizontal position relative to the cursor.
        positions = {}
        # Priority queue of (distance, position, location), nearest first.
        todo = []
        heapq.heappush(todo, (0, 0, cursor))

        def add_todo(location, distance, position):
            # Queue a location for exploration.  Raises Out_of_bounds when
            # it lies beyond the cutoff; silently ignores locations that
            # have already been placed.
            if distance > distance_cutoff:
                raise Out_of_bounds()
            if location in positions:
                return
            assert self.valid_location(location)
            heapq.heappush(todo, (distance, position, location))

        #dag = Dag()
        # Adjacency lists: location -> list of locations that come after it.
        dag = {}

        def dag_link(a, b):
            # Record the edge a -> b, creating both nodes if needed.
            if a not in dag: dag[a] = []
            if b not in dag: dag[b] = []
            dag[a].append(b)

        # Groups locations into contiguous runs.
        # presumably Union is a disjoint-set structure -- TODO confirm.
        contigua = Union()

        while todo:
            distance, position, location = heapq.heappop(todo)
            # A location may be queued more than once; the first pop wins.
            if location in positions: continue
            positions[location] = position

            #dag.get_keyset(location)
            if location not in dag: dag[location] = []

            contigua.create(location)

            #flipped_location = location ^ FORWARD_MASK
            #add_todo(flipped_location, distance)
            #dag.merge_keys(flipped_location, location)

            # Step forward along the sequence: edge location -> neighbour.
            # Out_of_bounds (presumably also raised by location_move at a
            # sequence end -- TODO confirm) simply stops the walk here.
            try:
                linked_location = self.location_move(location, 1)
                contigua.merge_if_created(location, linked_location)
                add_todo(linked_location, distance + 1, position + 1)
                dag_link(location, linked_location)
            except Out_of_bounds:
                pass

            # Step backward along the sequence: edge neighbour -> location.
            try:
                linked_location = self.location_move(location, -1)
                contigua.merge_if_created(location, linked_location)
                add_todo(linked_location, distance + 1, position - 1)
                dag_link(linked_location, location)
            except Out_of_bounds:
                pass

            def merge(linked_location):
                # A base link costs nothing: same distance and position as
                # the current location, with edges in both directions.
                try:
                    add_todo(linked_location, distance, position)
                    dag_link(location, linked_location)
                    dag_link(linked_location, location)
                except Out_of_bounds:
                    pass

            # Follow base links stored with this location at either end...
            for i in self.base_links.find_all('location1', location):
                merge(self.base_links.location2[i])
            for i in self.base_links.find_all('location2', location):
                merge(self.base_links.location1[i])

            # ...and links stored for the location with its FORWARD_MASK bit
            # flipped, flipping the partner's bit as well.
            for i in self.base_links.find_all('location1',
                                              location ^ FORWARD_MASK):
                merge(self.base_links.location2[i] ^ FORWARD_MASK)
            for i in self.base_links.find_all('location2',
                                              location ^ FORWARD_MASK):
                merge(self.base_links.location1[i] ^ FORWARD_MASK)

        # Plain attribute holder describing one screen row.
        class Contig:
            pass

        contigs = []
        for item in contigua.sets():
            # Any member will do for identifying the sequence and strand.
            sample = iter(item).next()
            seq = location_sequence(sample)
            forward = (sample & FORWARD_MASK) != 0
            contig = Contig()
            contigs.append(contig)
            contig.seq = seq
            contig.name = self.sequences.name[seq]
            contig.forward = forward
            # Sort by name, forward strand before reverse.
            contig.sort_key = (contig.name, not forward)
            contig.locations = item

        contigs.sort(lambda a, b: cmp(a.sort_key, b.sort_key))

        #print contigua

        #for contig in contigua:
        #    item = iter(contig).next()
        #    seq = location_sequence( item )
        #    forward = (item & FORWARD_MASK) != 0
        #    print self.sequences.name[seq], forward

        #order = dag.sort(positions)
        def priority(component):
            # Mean horizontal position of the locations in a component.
            return float(sum([positions[item]
                              for item in component])) / len(component)

        order = sort.compact_robust_topological_sort(dag, priority)

        # table[x][y] is the list of locations of column x shown in row y.
        table = []
        # column_width[x] is the widest cell of column x.
        column_width = []

        for x, locations in enumerate(order):
            column = []
            table.append(column)
            for y, contig in enumerate(contigs):
                relevant = [
                    location for location in locations
                    if location in contig.locations
                ]
                relevant.sort()
                # Reverse the cell when its first location lacks the
                # FORWARD_MASK bit.
                if relevant and not (relevant[0] & FORWARD_MASK):
                    relevant = relevant[::-1]
                if self.cursor in relevant:
                    cursor_y = y
                    # column_width only holds the columns before this one
                    # here, so the sum is this column's left edge.
                    cursor_x = numpy.sum(column_width) + relevant.index(
                        self.cursor)
                    cursor_column = column
                column.append(relevant)
        #sequence.string_from_sequence( [ self.location_get(location)
        #                          for location in relevant ] ) )

            column_width.append(max([len(item) for item in column]))

        self.screen.clear()

        # Offsets that put the cursor cell at the centre of the screen.
        maxy, maxx = self.screen.getmaxyx()
        offset_y = int(maxy // 2 - cursor_y)
        offset_x = int(maxx // 2 - cursor_x)

        def addstr(y, x, string):
            # Draw string at (y, x), clipped to the visible screen area.
            if y < 0 or y >= maxy: return
            while string and x < 0:
                string = string[1:]
                x += 1
            if x + len(string) > maxx:
                string = string[:max(0, maxx - x)]
            if not string: return
            #try:
            self.screen.addstr(y, x, string)
            #except:
            #    raise repr((y,x,string,maxy,maxx))

        for y in xrange(len(contigs)):

            #item = iter(contigua[y]).next()
            #seq = location_sequence( item )
            #forward = (item & FORWARD_MASK) != 0
            #sys.stdout.write('% 20s %d  ' % (self.sequences.name[seq],forward))

            # Running left edge of the current column, before the offset.
            scr_x = 0
            for x in xrange(len(order)):
                item = table[x][y]
                #item += ' '*(column_width[x]-len(item))
                #sys.stdout.write(item)

                string = sequence.string_from_sequence(
                    [self.location_get(location) for location in item])

                addstr(y + offset_y, scr_x + offset_x, string)

                scr_x += column_width[x]

            #sys.stdout.write('\n')

            # Row label: name, optional comment and a strand arrow, placed
            # so that it ends just left of the first column (never left of
            # screen column 0).
            info = contigs[y].name
            if self.sequences.comment[contigs[y].seq]:
                info += ' ' + self.sequences.comment[contigs[y].seq]
            if contigs[y].forward:
                info += ' >>> '
            else:
                info += ' <<< '
            addstr(y + offset_y, max(0, -len(info) - 1 + offset_x), info)

        # Status text near the top-left corner: sequence name and position.
        cursor_seq, cursor_fwd, cursor_pos = location_parts(cursor)
        addstr(1, 1, '%s @ %d' % (self.sequences.name[cursor_seq], cursor_pos))

        self.screen.move(cursor_y + offset_y, cursor_x + offset_x)
        self.screen.refresh()

        return cursor_column, cursor_y
Example #4
0
def align(seq1, seq2, n_errors, indel_cost):
    """ Produce an alignment (for once we have found a hit).
        Start point is zero in both seqs.
        End point may be anywhere in seq2, must be end of seq1.

        Only a diagonal band of the score matrix is filled in: cells
        (i, j) with |i - j| <= radius, where radius = n_errors // indel_cost
        is the largest number of indels the error budget allows.  Cells
        bordering the band hold the sentinel n_errors + 1.

        The mismatch cost of a pair of bases is looked up in
        sequence.NOTEQUAL; each gap position costs indel_cost.

        Returns a tuple (ali1, ali2, end2, score): the two gapped alignment
        strings, the end position chosen in seq2, and the score at that
        position.

        The inputs are expected to be alignable within the band; if seq1 is
        longer than seq2 by more than radius the band runs off the matrix
        and an exception is raised. """
    radius = n_errors // indel_cost

    len1 = len(seq1)
    len2 = len(seq2)
    # numpy.empty leaves the matrix uninitialised; only cells written below
    # may be read.
    scores = numpy.empty((len1 + 1, len2 + 1), 'int')

    # First row and column: a pure run of gaps.  The run is clipped to the
    # matrix so that sequences shorter than radius + 1 do not cause a
    # shape mismatch.
    border = numpy.arange(radius + 2) * indel_cost
    scores[0, :radius + 2] = border[:len2 + 1]
    scores[:radius + 2, 0] = border[:len1 + 1]

    #TODO: no need to allocate entire array

    for i in range(1, len1 + 1):
        left = max(1, i - radius)
        right = min(len2, i + radius)
        if left > 1:
            # Sentinel just left of this row's band.
            scores[i, left - 1] = n_errors + 1
        if i > 1 and i + radius <= len2:
            # Sentinel just right of the previous row's band.  When the band
            # is clamped at the right edge (i + radius > len2) this cell is
            # a real score computed for the previous row and must be kept.
            scores[i - 1, right] = n_errors + 1
        for j in range(left, right + 1):
            scores[i, j] = min(
                scores[i - 1, j - 1] +
                sequence.NOTEQUAL[seq1[i - 1], seq2[j - 1]],
                scores[i - 1, j] + indel_cost, scores[i, j - 1] + indel_cost)

    # Best end point in seq2, searched within the band of the last row.
    left = max(1, len1 - radius)
    right = min(len2, len1 + radius)
    end2 = numpy.argmin(scores[len1, left:right + 1]) + left

    str_seq1 = sequence.string_from_sequence(seq1)
    str_seq2 = sequence.string_from_sequence(seq2)

    # Trace back from (len1, end2) to the origin, collecting the alignment
    # in reverse; at each step move to the lowest-scoring predecessor,
    # preferring the diagonal on ties.
    pos1 = len1
    pos2 = end2
    ali1 = []
    ali2 = []
    while True:
        if pos1 and pos2:
            step = scores[pos1 - 1, pos2 - 1]
            del1 = scores[pos1 - 1, pos2]
            del2 = scores[pos1, pos2 - 1]
            if step <= del1 and step <= del2:
                ali1.append(str_seq1[pos1 - 1])
                ali2.append(str_seq2[pos2 - 1])
                pos1 -= 1
                pos2 -= 1
            elif del1 <= del2:
                ali1.append(str_seq1[pos1 - 1])
                ali2.append('-')
                pos1 -= 1
            else:
                ali1.append('-')
                ali2.append(str_seq2[pos2 - 1])
                pos2 -= 1
        elif pos1:
            # seq2 exhausted: the rest of seq1 aligns against gaps.
            ali1.append(str_seq1[:pos1])
            ali2.append('-' * pos1)
            break
        else:
            # seq1 exhausted: the rest of seq2 aligns against gaps.
            ali1.append('-' * pos2)
            ali2.append(str_seq2[:pos2])
            break

    return ''.join(ali1[::-1]), ''.join(ali2[::-1]), end2, scores[len1, end2]
Example #5
0
            todo[hits.start[i]].append(i)

    lanes = []

    def find_lane():
        """Return the index of a free (None) slot in `lanes`.

        Slots are probed in the interleaved order 0, 4, 2, 6, 1, 5, 3, 7
        within each group of eight.  When no slot is free, eight empty
        slots are appended to `lanes` and the search is repeated.
        """
        probe_order = (0, 4, 2, 6, 1, 5, 3, 7)
        while True:
            for offset in probe_order:
                for slot in xrange(offset, len(lanes), 8):
                    if lanes[slot] is None:
                        return slot
            # Every lane is occupied: grow in place and scan again.
            lanes.extend([None] * 8)

    pad = ' ' * 5

    total_with_a_hit = 0
    for pos, ref_nuc in enumerate(sequence.string_from_sequence(reference)):
        while lanes and lanes[-1] is None:
            del lanes[-1]

        if pos in todo:
            for i in todo[pos]:
                lane_no = find_lane()
                lanes[lane_no] = [
                    hits.ref_ali[i] + pad, hits.read_ali[i] + pad
                ]

        to_show = [ref_nuc]

        for i in xrange(len(lanes)):
            if lanes[i] is None:
                to_show.append('')