def code_seqs_shape_features(seqs, seq_len, n_seqs):
    """Encode shape features for every sequence and its reverse complement.

    Two zero-initialised arrays of shape ``(n_seqs, seq_len, 6)`` and dtype
    ``SHAPE_PARAM_TYPE`` are allocated.  Row ``i`` of the first receives
    ``code_sequence_shape(seq)`` for the i-th sequence; row ``i`` of the
    second receives the same encoding of ``reverse_complement(seq)``.

    Returns the pair ``(forward_features, reverse_complement_features)``.
    """
    dims = (n_seqs, seq_len, 6)
    forward = np.zeros(dims, dtype=SHAPE_PARAM_TYPE)
    reverse = np.zeros(dims, dtype=SHAPE_PARAM_TYPE)
    for row, seq in enumerate(seqs):
        forward[row] = code_sequence_shape(seq)
        reverse[row] = code_sequence_shape(reverse_complement(seq))
    return forward, reverse
def code_seqs_shape_features(seqs, seq_len, n_seqs):
    """Build per-sequence shape-feature arrays for both strands.

    For each sequence in ``seqs`` the result of ``code_sequence_shape`` is
    stored for the sequence itself and for its ``reverse_complement``.  Both
    output arrays have shape ``(n_seqs, seq_len, 6)`` and dtype
    ``SHAPE_PARAM_TYPE``; rows that are never written stay zero.

    Returns ``(shape_features, RC_shape_features)``.
    """

    def _blank():
        # Fresh zero array; called once per strand.
        return np.zeros((n_seqs, seq_len, 6), dtype=SHAPE_PARAM_TYPE)

    fwd_features, rc_features = _blank(), _blank()
    for index, seq in enumerate(seqs):
        fwd_features[index, ...] = code_sequence_shape(seq)
        rc_features[index, ...] = code_sequence_shape(reverse_complement(seq))
    return fwd_features, rc_features
def load_myr_hits(self, filename):
    """Register the reads and alignments listed in a myr hit file.

    Every record yielded by ``iter_hit_file(filename)`` is unpacked into
    ``(ref_name, name, forward, start, end, read_ali, ref_ali)``.  A read
    whose name is not yet in ``self.name_to_sequence`` is added first: its
    sequence is the aligned read text with ``'-'`` characters removed,
    reverse-complemented when the hit is not forward.  A ``'myr align'``
    alignment between the reference and the read is then recorded.
    """
    for hit in iter_hit_file(filename):
        ref_name, name, forward, start, end, read_ali, ref_ali = hit

        if name not in self.name_to_sequence:
            ungapped = read_ali.replace('-', '')
            seq = sequence.sequence_from_string(ungapped)
            if not forward:
                seq = sequence.reverse_complement(seq)
            self.add_sequence(name, seq)

        # NOTE(review): the looked-up sequence is never used afterwards; the
        # lookup is kept so behaviour (including any lookup error) is unchanged.
        seq = self.sequences.sequence[self.name_to_sequence[name]]

        self.add_alignment('myr align',
                           ref_name, True, start, ref_ali,
                           name, forward, 0, read_ali)
def __init__(self, seq, fwd_coded_seq=None, rc_coded_seq=None, include_shape=False):
    """Store a sequence together with its forward and reverse-complement codings.

    seq           -- the sequence itself, stored as ``self.seq``.
    fwd_coded_seq -- precomputed coding of ``seq``; computed when None.
    rc_coded_seq  -- precomputed coding of ``reverse_complement(seq)``;
                     computed when None.
    include_shape -- when true, a computed coding is the one-hot encoding
                     horizontally stacked with ``code_sequence_shape``
                     output; otherwise it is the one-hot encoding alone.
                     Codings passed in by the caller are stored untouched.
    """

    def _encode(strand):
        # One-hot encoding, optionally widened with the shape features.
        one_hot = one_hot_encode_sequence(strand)
        if not include_shape:
            return one_hot
        shape = code_sequence_shape(strand)
        return np.hstack((one_hot, shape))

    self.seq = seq

    if fwd_coded_seq is None:
        fwd_coded_seq = _encode(seq)
    if rc_coded_seq is None:
        rc_coded_seq = _encode(reverse_complement(seq))

    self.fwd_coded_seq = fwd_coded_seq
    self.rc_coded_seq = rc_coded_seq
def main(argv): if len(argv) != 2: print print 'myr shred' print print 'Generate fake Illumina reads.' print 'Not guaranteed to be sanely calibrated, for testing only.' print print 'Usage:' print print ' myr shred <number of reads> <sequence.fna>' print return 1 how_many = int(argv[0]) seq = sequence.sequence_file_iterator(argv[1]).next()[1] READ_SIZE = 33 error_p = numpy.array([ 0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261, 0.00928905, 0.00938066, 0.00936397, 0.00939301, 0.00947136, 0.00952966, 0.00956763, 0.01073044, 0.01091972, 0.01121085, 0.01159389, 0.01200634, 0.01233303, 0.01271543, 0.01334389, 0.01349712, 0.01412138, 0.01462227, 0.01720922, 0.01617627, 0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367, 0.02354595, 0.02560759, 0.03480737 ]) for i in xrange(how_many): print '>read%d' % i pos = random.randint(len(seq) - READ_SIZE + 1) read = seq[pos:pos + READ_SIZE] if random.randint(2): read = sequence.reverse_complement(read) read = read.copy() mutations = random.random(READ_SIZE) < error_p read[mutations] = (read[mutations] + random.randint( 1, 4, size=numpy.sum(mutations)).astype('uint8')) % 4 print sequence.string_from_sequence(read)
def main(argv):
    """Align short reads to a reference genome (the 'myr align' command).

    argv is ``[<max error>, <indel cost>, <reference.fna>, <reads.fna>, ...]``.
    With fewer than four arguments the usage text is written to stderr and 1
    is returned.  Otherwise reads are batched by length, each batch is sent
    to a child process, whatever the children report is printed to stdout,
    and 0 is returned.

    NOTE(review): the nesting below was reconstructed from a source whose
    line structure had been lost -- confirm the indentation against the
    original file.
    """
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr align <max error> <indel cost> <reference.fna> <reads.fna> [<reads.fna>...]'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Align short reads to a reference genome.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Files can be in FASTA or ELAND format.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Each subsitution counts as one error. The cost of an indel can be specified,'
        print >> sys.stderr, 'but must be an integer. The whole read (not just part of it) must align to '
        print >> sys.stderr, 'the reference with less than the specified maximum errors in order to'
        print >> sys.stderr, 'produce a hit.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'For Illumina reads, we suggest (on the basis of very little experience):'
        print >> sys.stderr, ''
        print >> sys.stderr, ' myr align 6 2 reference.fna reads.fna'
        print >> sys.stderr, ''
        return 1
    if CELL_PROCESSOR:
        print >> sys.stderr, 'Cell processor detected'
    else:
        print >> sys.stderr, 'Cell processor not detected'
    print >> sys.stderr, 'Using', PROCESSES, 'processes'
    maxerror = int(argv[0])
    assert maxerror >= 0
    indel_cost = int(argv[1])
    assert indel_cost >= 1
    # Children move between these two lists: `waiting` holds idle ones,
    # `running` holds those currently working on a batch.
    waiting = [children.Self_child() for i in xrange(PROCESSES)]
    running = []
    t1 = time.time()
    # One-element list so handle_events() can update the count in place.
    total_alignments = [0]

    def handle_events():
        # Process one round of messages from the running children.
        for child in children.wait(running):
            message, value = child.receive()
            if message == 'done':
                # Batch finished: the child becomes idle again.
                running.remove(child)
                waiting.append(child)
                dt = time.time() - t1
                total_alignments[
                    0] += value // 2  # Forwards + backwards == 1 alignment
                util.show_status(
                    '%d alignments in %.2f seconds, %.4f per alignment' %
                    (total_alignments[0], dt, dt / total_alignments[0]))
            else:
                # Any other message: its value is printed to stdout.
                print value

    print '#Max errors:', maxerror
    print '#Indel cost:', indel_cost
    for ref_name, ref_seq in sequence.sequence_file_iterator(argv[2]):
        print '#Reference:', ref_name
        for child in waiting:
            child.send(('ref', ref_seq))
        # Collect reads of the same length,
        # and do them in batches
        buckets = {}  # length -> [ [name], [seq] ]

        def do_bucket(length, only_if_full):
            # Send up to one chunk of the reads of this length to a child.
            # With only_if_full set, nothing happens until a whole chunk is
            # available.
            if CELL_PROCESSOR:
                #Hmmm
                chunk = 1800000 // (length * ((maxerror + 1) * 2 + 5))
                # Round down to a multiple of 128, but never below 128.
                chunk -= chunk & 127
                chunk = max(chunk, 128)
            else:
                chunk = 8192
            if only_if_full and len(buckets[length][0]) < chunk:
                return
            read_names = buckets[length][0][:chunk]
            del buckets[length][0][:chunk]
            read_seqs = buckets[length][1][:chunk]
            del buckets[length][1][:chunk]
            # Drop the bucket once it has been emptied.
            if not buckets[length][0]:
                del buckets[length]
            # Handle child messages until at least one child is idle.
            while not waiting:
                handle_events()
            #print >> sys.stderr, 'Starting batch alignment of', len(read_seqs), '%d-mers'%length
            child = waiting.pop()
            child.send(
                ('align', (read_seqs, read_names, maxerror, indel_cost)))
            running.append(child)

        for read_name, read_seq in sequence.sequence_files_iterator(argv[3:]):
            length = len(read_seq)
            if length not in buckets:
                buckets[length] = ([], [])
            # Each read is queued twice: as given and reverse-complemented.
            buckets[length][0].append(read_name + ' fwd')
            buckets[length][1].append(read_seq)
            buckets[length][0].append(read_name + ' rev')
            buckets[length][1].append(sequence.reverse_complement(read_seq))
            do_bucket(length, True)
        # Flush the partially filled buckets that remain.
        while buckets:
            for length in list(buckets):
                do_bucket(length, False)
        # Wait for every outstanding batch to finish.
        while running:
            handle_events()
    for child in waiting:
        child.close()
    util.show_status('')
    return 0
# Write one FASTA record per entry of `data`, built by concatenating the
# sub-sequences of `seq` covered by that entry's features whose type is listed
# in options.features.  Output goes to options.oFilename when set, else stdout.
if options.oFilename:
    oFile = open(options.oFilename, 'w')
else:
    oFile = sys.stdout
writer = fasta.MfaWriter(oFile)

for name in data:
    s = []
    extrema = []
    for f in data[name]:
        if f.type not in options.features:
            continue
        # Feature coordinates are used as 1-based inclusive positions.
        _seq = seq[f.start-1:f.end]
        if f.strand != '+':
            _seq = sequence.reverse_complement(_seq)
        s.append(_seq)
        extrema.append(f.start)
        extrema.append(f.end)

    if not extrema:
        # No feature of a requested type for this entry: there is nothing to
        # write, and min()/max() below would raise ValueError on an empty list.
        continue

    start = min(extrema)
    end = max(extrema)

    # NOTE(review): `f` is whichever feature the loop above visited last, even
    # if it was not of a selected type -- confirm that its name, reference and
    # strand are the intended header fields.
    if f.strand == '-':
        # Pieces were collected in file order; reversed for the minus strand.
        s.reverse()
    s = ''.join(s)
    h = 'gene_%s %s:%s-%s(%s)' % (f.extractName(), f.reference, start, end, f.strand)
    writer.write(h, s)

writer.close()
def reverse_complement(motif):
    """Return a new motif built from the reverse complement of each site.

    Every element of ``motif.sites`` is passed through
    ``sequence.reverse_complement`` and the resulting list is handed to
    ``new_motif``.
    """
    rc_sites = list(map(sequence.reverse_complement, motif.sites))
    return new_motif(rc_sites)
def reverse_complement(self):
    """Return a new DNASequence for the reverse complement of this one.

    The raw sequence is reverse-complemented, and the two stored codings are
    passed in swapped order (``rc_coded_seq`` first, then ``fwd_coded_seq``)
    rather than being recomputed -- presumably so the new object's forward
    coding is this object's reverse-complement coding and vice versa.
    """
    rc_seq = reverse_complement(self.seq)
    return DNASequence(rc_seq, self.rc_coded_seq, self.fwd_coded_seq)
action="store_true", dest="complement", help="Complement sequence", default=False)
parser.add_option(
    "-b", "--reverseComplement", "--revComp", action="store_true", dest="reverseComplement", help="Reverse complement sequence", default=False)
# The whole of sys.argv is parsed, so the positional arguments read below
# start at args[1]: input file, start and end.
options, args = parser.parse_args(sys.argv)
iFilename = args[1]
start = int(args[2])
end = int(args[3])
header,seq = fasta.load(iFilename)
# start/end are used as 1-based inclusive coordinates.
s = seq[start-1:end]
h = '%s %i-%i' % (header,start,end)
# At most one transformation is applied; when several flags are set the first
# matching branch wins.  The header is tagged with what was done.
if options.reverse:
    s = sequence.reverse(s)
    h += '(r)'
elif options.complement:
    s = sequence.complement(s)
    h += '(c)'
elif options.reverseComplement:
    s = sequence.reverse_complement(s)
    h += '(rc)'
fasta.pretty(h, s, width=options.width)
def load_velvet_graph(self, filename):
    """Load a Velvet assembly directory into this object.

    ``filename`` is a directory containing ``stats.txt`` and ``LastGraph``.

    * ``stats.txt`` (first line skipped) supplies, per node, a comment of the
      form ``cov=<long_cov + short1_cov + short2_cov>``.
    * Each ``NODE`` record of ``LastGraph`` is followed by two sequence
      lines.  They are merged into a single sequence that is registered with
      ``self.add_sequence`` under the name ``NODE_<id>``.
    * Each ``ARC`` record is registered with ``self.add_alignment`` as a
      ``'velvet_arc'`` joining the last ``tail_size`` positions of the source
      node to the first ``tail_size`` positions of the target node.

    Both files are closed before returning, including when parsing fails.
    """
    comments = {}

    f = open(os.path.join(filename, 'stats.txt'), 'rb')
    try:
        f.readline()  # Skip the first line.
        for line in f:
            (ID, lgth, n_out, n_in, long_cov, short1_cov, short1_Ocov,
             short2_cov, short2_Ocov) = line.strip().split()
            comments['NODE_' + ID] = 'cov=%.1f' % (
                float(long_cov) + float(short1_cov) + float(short2_cov))
    finally:
        f.close()

    f = open(os.path.join(filename, 'LastGraph'), 'rb')
    try:
        # The third field of the first line is the hash size.
        line = f.readline()
        hash_size = int(line.split()[2])
        tail_size = hash_size - 1

        # readline() is used throughout, rather than iterating over the file,
        # because NODE records pull two further lines inside the loop body.
        while True:
            line = f.readline()
            if not line:
                break
            parts = line.strip().split()

            if parts[0] == 'NODE':
                node_name = 'NODE_' + parts[1]
                fwd = sequence.sequence_from_string(f.readline().strip())
                rev = sequence.sequence_from_string(f.readline().strip())
                assert len(fwd) == len(rev)

                if len(fwd) < tail_size:
                    # Left-pad short nodes with code 4 up to tail_size
                    # (presumably the "unknown base" code -- confirm).
                    pad = [4] * (tail_size - len(fwd))
                    fwd = numpy.concatenate((pad, fwd))
                    rev = numpy.concatenate((pad, rev))

                rev_rc = sequence.reverse_complement(rev)

                #TODO: IUPAC codes where different
                # The merged sequence is: the first tail_size positions of
                # rev_rc, then the overlap of the two strands (code 4 where
                # they disagree), then the last tail_size positions of fwd.
                inner_fwd = fwd[:-tail_size]
                inner_rev = rev_rc[tail_size:]
                merged = numpy.concatenate((
                    rev_rc[:tail_size],
                    numpy.where(numpy.equal(inner_fwd, inner_rev),
                                inner_fwd, 4),
                    fwd[-tail_size:],
                ))
                self.add_sequence(node_name, merged, comments[node_name])

            elif parts[0] == 'ARC':
                # A negative node id yields a False "forward" flag for that
                # end of the arc.
                node_from = int(parts[1])
                name_from = 'NODE_%d' % abs(node_from)
                fwd_from = node_from >= 0

                node_to = int(parts[2])
                name_to = 'NODE_%d' % abs(node_to)
                fwd_to = node_to >= 0

                len_from = len(
                    self.sequences.sequence[self.name_to_sequence[name_from]])

                self.add_alignment('velvet_arc',
                                   name_from, fwd_from, len_from - tail_size,
                                   'X' * tail_size,
                                   name_to, fwd_to, 0,
                                   'X' * tail_size)
    finally:
        f.close()
help="Complement sequence", default=False)
parser.add_option("-b", "--reverseComplement", "--revComp",
                  action="store_true", dest="reverseComplement",
                  help="Reverse complement sequence", default=False)
# The whole of sys.argv is parsed, so the positional arguments read below
# start at args[1]: input file, start and end.
options, args = parser.parse_args(sys.argv)
iFilename = args[1]
start = int(args[2])
end = int(args[3])
header, seq = fasta.load(iFilename)
# start/end are used as 1-based inclusive coordinates.
s = seq[start - 1:end]
h = '%s %i-%i' % (header, start, end)
# At most one transformation is applied; when several flags are set the first
# matching branch wins.  The header is tagged with what was done.
if options.reverse:
    s = sequence.reverse(s)
    h += '(r)'
elif options.complement:
    s = sequence.complement(s)
    h += '(c)'
elif options.reverseComplement:
    s = sequence.reverse_complement(s)
    h += '(rc)'
fasta.pretty(h, s, width=options.width)