コード例 #1
0
def code_seqs_shape_features(seqs, seq_len, n_seqs):
    """Encode shape features for each sequence and for its reverse complement.

    Args:
        seqs: iterable of sequences; each item is handed to
            ``code_sequence_shape`` directly and again after
            ``reverse_complement``.
        seq_len: size of the second axis of both output arrays.
        n_seqs: size of the first axis of both output arrays. Rows past the
            number of items in ``seqs`` stay zero; more items than rows
            raises ``IndexError``.

    Returns:
        A ``(forward, reverse_complement)`` pair of arrays, each of shape
        ``(n_seqs, seq_len, 6)`` and dtype ``SHAPE_PARAM_TYPE``.
    """
    dims = (n_seqs, seq_len, 6)
    fwd_features = np.zeros(dims, dtype=SHAPE_PARAM_TYPE)
    rc_features = np.zeros(dims, dtype=SHAPE_PARAM_TYPE)

    for row, seq in enumerate(seqs):
        # Each encoder result fills one full (seq_len, 6) row.
        fwd_features[row] = code_sequence_shape(seq)
        rc_features[row] = code_sequence_shape(reverse_complement(seq))

    return fwd_features, rc_features
コード例 #2
0
ファイル: shape.py プロジェクト: nboley/pyDNAbinding
def code_seqs_shape_features(seqs, seq_len, n_seqs):
    """Build forward and reverse-complement shape-feature arrays.

    Args:
        seqs: iterable of sequences to encode with ``code_sequence_shape``.
        seq_len: length of the second axis of the returned arrays.
        n_seqs: length of the first axis of the returned arrays; it has to
            be at least the number of items in ``seqs`` or the row
            assignment raises ``IndexError``.

    Returns:
        Tuple ``(shape_features, RC_shape_features)``; both arrays have shape
        ``(n_seqs, seq_len, 6)`` and dtype ``SHAPE_PARAM_TYPE``.
    """
    def _blank():
        # Fresh zero-filled array; rows without a sequence remain zero.
        return np.zeros((n_seqs, seq_len, 6), dtype=SHAPE_PARAM_TYPE)

    forward, rev_comp = _blank(), _blank()

    for index, seq in enumerate(seqs):
        forward[index, ...] = code_sequence_shape(seq)
        rev_comp[index, ...] = code_sequence_shape(reverse_complement(seq))

    return forward, rev_comp
コード例 #3
0
    def load_myr_hits(self, filename):
        """Load every hit from a myr hit file as a 'myr align' alignment.

        For each record yielded by ``iter_hit_file(filename)`` the read is
        registered via ``add_sequence`` the first time its name is seen, and
        an alignment between the reference and the read is added.
        """
        for hit in iter_hit_file(filename):
            ref_name, name, forward, start, end, read_ali, ref_ali = hit

            if name not in self.name_to_sequence:
                # Rebuild the read from its aligned form by dropping gaps.
                ungapped = read_ali.replace('-', '')
                read_seq = sequence.sequence_from_string(ungapped)
                if not forward:
                    # Presumably the aligned text is on the reference strand,
                    # so reverse hits are flipped back -- confirm with
                    # iter_hit_file.
                    read_seq = sequence.reverse_complement(read_seq)
                self.add_sequence(name, read_seq)

            # The looked-up value is not used afterwards; the lookup is kept
            # because it raises if the name was not registered.
            seq = self.sequences.sequence[self.name_to_sequence[name]]

            self.add_alignment(
                'myr align',
                ref_name, True, start, ref_ali,
                name, forward, 0, read_ali)
コード例 #4
0
ファイル: binding_model.py プロジェクト: nboley/pyDNAbinding
 def __init__(self, seq, fwd_coded_seq=None, rc_coded_seq=None, include_shape=False):
     """Store a sequence together with its forward and reverse-complement encodings.

     Args:
         seq: the sequence; stored as ``self.seq``.
         fwd_coded_seq: precomputed encoding of ``seq``; computed when None.
         rc_coded_seq: precomputed encoding of the reverse complement of
             ``seq``; computed when None.
         include_shape: when an encoding is computed here, append the
             columns from ``code_sequence_shape`` to the one-hot columns.
             It has no effect on encodings that were passed in.
     """
     def _encode(bases):
         # One-hot encoding, optionally widened with the shape columns.
         one_hot = one_hot_encode_sequence(bases)
         if not include_shape:
             return one_hot
         return np.hstack((one_hot, code_sequence_shape(bases)))

     self.seq = seq

     fwd = _encode(seq) if fwd_coded_seq is None else fwd_coded_seq
     if rc_coded_seq is None:
         rc = _encode(reverse_complement(seq))
     else:
         rc = rc_coded_seq

     self.fwd_coded_seq = fwd
     self.rc_coded_seq = rc
コード例 #5
0
ファイル: shred.py プロジェクト: gitGNU/gnu_myrialign
def main(argv):
    if len(argv) != 2:
        print
        print 'myr shred'
        print
        print 'Generate fake Illumina reads.'
        print 'Not guaranteed to be sanely calibrated, for testing only.'
        print
        print 'Usage:'
        print
        print '    myr shred <number of reads> <sequence.fna>'
        print
        return 1

    how_many = int(argv[0])
    seq = sequence.sequence_file_iterator(argv[1]).next()[1]

    READ_SIZE = 33
    error_p = numpy.array([
        0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261, 0.00928905,
        0.00938066, 0.00936397, 0.00939301, 0.00947136, 0.00952966, 0.00956763,
        0.01073044, 0.01091972, 0.01121085, 0.01159389, 0.01200634, 0.01233303,
        0.01271543, 0.01334389, 0.01349712, 0.01412138, 0.01462227, 0.01720922,
        0.01617627, 0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367,
        0.02354595, 0.02560759, 0.03480737
    ])

    for i in xrange(how_many):
        print '>read%d' % i

        pos = random.randint(len(seq) - READ_SIZE + 1)
        read = seq[pos:pos + READ_SIZE]
        if random.randint(2): read = sequence.reverse_complement(read)

        read = read.copy()
        mutations = random.random(READ_SIZE) < error_p
        read[mutations] = (read[mutations] + random.randint(
            1, 4, size=numpy.sum(mutations)).astype('uint8')) % 4

        print sequence.string_from_sequence(read)
コード例 #6
0
ファイル: align.py プロジェクト: gitGNU/gnu_myrialign
def main(argv):
    """Align reads against each reference sequence using a pool of child processes.

    argv layout: ``[<max error>, <indel cost>, <reference file>, <reads file>, ...]``.
    With fewer than four arguments the usage text is written to stderr and 1
    is returned; otherwise 0 is returned once every reference is processed.

    For every reference sequence the reads are grouped by length, each read
    is queued both as given (' fwd') and reverse-complemented (' rev'), and
    batches are handed to idle children. Values of child messages other than
    'done' are printed to stdout.
    """
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr align <max error> <indel cost> <reference.fna> <reads.fna> [<reads.fna>...]'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Align short reads to a reference genome.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Files can be in FASTA or ELAND format.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Each subsitution counts as one error. The cost of an indel can be specified,'
        print >> sys.stderr, 'but must be an integer. The whole read (not just part of it) must align to '
        print >> sys.stderr, 'the reference with less than the specified maximum errors in order to'
        print >> sys.stderr, 'produce a hit.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'For Illumina reads, we suggest (on the basis of very little experience):'
        print >> sys.stderr, ''
        print >> sys.stderr, '    myr align 6 2 reference.fna reads.fna'
        print >> sys.stderr, ''
        return 1

    if CELL_PROCESSOR:
        print >> sys.stderr, 'Cell processor detected'
    else:
        print >> sys.stderr, 'Cell processor not detected'

    print >> sys.stderr, 'Using', PROCESSES, 'processes'

    # NOTE(review): these asserts validate user input and are stripped when
    # Python runs with -O.
    maxerror = int(argv[0])
    assert maxerror >= 0
    indel_cost = int(argv[1])
    assert indel_cost >= 1

    # waiting: idle children; running: children with a batch in flight.
    waiting = [children.Self_child() for i in xrange(PROCESSES)]
    running = []

    t1 = time.time()
    # One-element list so the nested handle_events() can update the total
    # in place.
    total_alignments = [0]

    def handle_events():
        # Process messages from the children returned by children.wait().
        for child in children.wait(running):
            message, value = child.receive()
            if message == 'done':
                # The batch is finished: the child becomes idle again.
                running.remove(child)
                waiting.append(child)

                dt = time.time() - t1
                total_alignments[
                    0] += value // 2  # Forwards + backwards == 1 alignment
                # NOTE(review): raises ZeroDivisionError if the running total
                # is still 0 here (i.e. only 'done' values below 2 so far).
                util.show_status(
                    '%d alignments in %.2f seconds, %.4f per alignment' %
                    (total_alignments[0], dt, dt / total_alignments[0]))
            else:
                # Any other message: its value is printed to stdout.
                print value

    print '#Max errors:', maxerror
    print '#Indel cost:', indel_cost

    for ref_name, ref_seq in sequence.sequence_file_iterator(argv[2]):
        print '#Reference:', ref_name

        # 'running' is empty at this point (it starts empty and is drained at
        # the end of every reference), so every child is in 'waiting' and
        # receives the new reference.
        for child in waiting:
            child.send(('ref', ref_seq))

        # Collect reads of the same length,
        # and do them in batches
        buckets = {}  # length -> [ [name], [seq] ]

        def do_bucket(length, only_if_full):
            # Dispatch up to one chunk of queued reads of this length to an
            # idle child. When only_if_full is true, nothing happens until a
            # whole chunk has accumulated.
            if CELL_PROCESSOR:
                # Batch size depends on read length and maxerror; the
                # 1800000 budget is presumably a memory limit -- TODO confirm.
                chunk = 1800000 // (length * ((maxerror + 1) * 2 + 5))
                # Round down to a multiple of 128, but use at least 128.
                chunk -= chunk & 127
                chunk = max(chunk, 128)
            else:
                chunk = 8192

            if only_if_full and len(buckets[length][0]) < chunk:
                return

            # Take the first `chunk` names and sequences off the bucket.
            read_names = buckets[length][0][:chunk]
            del buckets[length][0][:chunk]
            read_seqs = buckets[length][1][:chunk]
            del buckets[length][1][:chunk]

            # Drop emptied buckets so the final `while buckets` loop ends.
            if not buckets[length][0]:
                del buckets[length]

            # Wait until at least one child is idle.
            while not waiting:
                handle_events()

            #print >> sys.stderr, 'Starting batch alignment of', len(read_seqs), '%d-mers'%length

            child = waiting.pop()
            child.send(
                ('align', (read_seqs, read_names, maxerror, indel_cost)))
            running.append(child)

        for read_name, read_seq in sequence.sequence_files_iterator(argv[3:]):
            length = len(read_seq)
            if length not in buckets:
                buckets[length] = ([], [])
            # Queue the read in both orientations.
            buckets[length][0].append(read_name + ' fwd')
            buckets[length][1].append(read_seq)
            buckets[length][0].append(read_name + ' rev')
            buckets[length][1].append(sequence.reverse_complement(read_seq))

            do_bucket(length, True)

        # Flush the remaining partially filled buckets.
        while buckets:
            for length in list(buckets):
                do_bucket(length, False)

        # Drain all in-flight batches before moving to the next reference.
        while running:
            handle_events()

    for child in waiting:
        child.close()

    util.show_status('')

    return 0
コード例 #7
0
ファイル: gff2fasta.py プロジェクト: PapenfussLab/Mungo
# Write to the file named by options.oFilename, or to stdout when it is unset.
if options.oFilename:
    oFile = open(options.oFilename, 'w')
else:
    oFile = sys.stdout

writer = fasta.MfaWriter(oFile)
for name in data:
    s = []        # sequence pieces of the selected features, in input order
    extrema = []  # start and end coordinate of every selected feature
    for f in data[name]:
        if f.type not in options.features:
            continue
        # The slice is [start-1:end], so coordinates are presumably 1-based
        # and inclusive -- confirm against the feature parser.
        _seq = seq[f.start-1:f.end]
        if f.strand != '+':
            _seq = sequence.reverse_complement(_seq)
        s.append(_seq)
        extrema.append(f.start)
        extrema.append(f.end)

    if not extrema:
        # No feature of a requested type: min()/max() below would raise
        # ValueError on the empty list, and `f` could be stale or undefined.
        sys.stderr.write(
            'Warning: no features of the requested type(s) for %s; skipping\n'
            % name)
        continue

    start = min(extrema)
    end = max(extrema)
    # `f` is the last feature iterated for this name (whether or not it was
    # selected); its strand, name and reference describe the whole record.
    if f.strand=='-':
        # Pieces were collected in input order; reverse them so the
        # concatenation follows the minus strand.
        s.reverse()
    s = ''.join(s)
    h = 'gene_%s %s:%s-%s(%s)' % (f.extractName(),f.reference,start,end,f.strand)
    writer.write(h, s)
writer.close()
コード例 #8
0
def reverse_complement(motif):
    """Return a new motif whose sites are the reverse complements of ``motif.sites``."""
    rc_sites = list(map(sequence.reverse_complement, motif.sites))
    return new_motif(rc_sites)
コード例 #9
0
ファイル: binding_model.py プロジェクト: nboley/pyDNAbinding
 def reverse_complement(self):
     """Return a DNASequence for the reverse complement of this sequence.

     The two stored encodings are passed in swapped order, so nothing is
     re-encoded. Inside this method the name ``reverse_complement`` resolves
     to the module-level function, not to this method.
     """
     rc_seq = reverse_complement(self.seq)
     # Presumably DNASequence takes (seq, fwd_coded_seq, rc_coded_seq), so
     # the existing RC encoding becomes the new forward one -- confirm.
     new_fwd, new_rc = self.rc_coded_seq, self.fwd_coded_seq
     return DNASequence(rc_seq, new_fwd, new_rc)
コード例 #10
0
ファイル: extractSeq.py プロジェクト: SiriusShiu/Mungo
    action="store_true",
    dest="complement",
    help="Complement sequence", 
    default=False)
parser.add_option(
    "-b", "--reverseComplement", "--revComp",
    dest="reverseComplement",
    action="store_true",
    default=False,
    help="Reverse complement sequence")
options, args = parser.parse_args(sys.argv)

# parse_args() was given all of sys.argv, so args[0] is the script path and
# the positional arguments start at index 1: <file> <start> <end>.
iFilename = args[1]
start, end = int(args[2]), int(args[3])

header,seq = fasta.load(iFilename)
h = '%s %i-%i' % (header,start,end)
# The slice is [start-1:end]; presumably start/end are 1-based and
# inclusive -- confirm with the script's usage text.
s = seq[start-1:end]

# At most one transformation is applied; when several flags are set the
# first match in this order wins.
if options.reverse:
    s, h = sequence.reverse(s), h + '(r)'
elif options.complement:
    s, h = sequence.complement(s), h + '(c)'
elif options.reverseComplement:
    s, h = sequence.reverse_complement(s), h + '(rc)'

fasta.pretty(h, s, width=options.width)
コード例 #11
0
    def load_velvet_graph(self, filename):
        """Import the nodes and arcs of a Velvet assembly graph.

        ``filename`` is a directory containing Velvet's ``stats.txt`` and
        ``LastGraph`` files.

        Each NODE record becomes one sequence named ``NODE_<id>``, carrying a
        ``cov=...`` comment computed from ``stats.txt``. Each ARC record
        becomes a 'velvet_arc' alignment joining the last ``tail_size``
        positions of the source node to the first ``tail_size`` positions of
        the target node.

        Raises KeyError if a node in ``LastGraph`` has no row in
        ``stats.txt`` or an arc refers to a node not yet loaded.
        """
        # Per-node coverage comment, keyed by node name.
        comments = {}
        with open(os.path.join(filename, 'stats.txt'), 'rb') as stats_file:
            stats_file.readline()  # skip the first line (presumably a header)
            for line in stats_file:
                (ID, lgth, n_out, n_in, long_cov, short1_cov, short1_Ocov,
                 short2_cov, short2_Ocov) = line.strip().split()
                comments['NODE_' + ID] = 'cov=%.1f' % (
                    float(long_cov) + float(short1_cov) + float(short2_cov))

        with open(os.path.join(filename, 'LastGraph'), 'rb') as graph_file:
            # The third field of the first line is the hash size.
            hash_size = int(graph_file.readline().split()[2])
            tail_size = hash_size - 1

            while True:
                line = graph_file.readline()
                if not line:
                    break
                parts = line.strip().split()
                if not parts:
                    # Blank line: nothing to parse.
                    continue

                if parts[0] == 'NODE':
                    node_name = 'NODE_' + parts[1]
                    # A NODE header is followed by two sequence lines.
                    fwd = sequence.sequence_from_string(
                        graph_file.readline().strip())
                    rev = sequence.sequence_from_string(
                        graph_file.readline().strip())
                    assert len(fwd) == len(rev)
                    if len(fwd) < tail_size:
                        # Left-pad short nodes with code 4 (presumably the
                        # "unknown base" code -- TODO confirm) so the tail
                        # slices below are well defined.
                        pad = [4] * (tail_size - len(fwd))
                        fwd = numpy.concatenate((pad, fwd))
                        rev = numpy.concatenate((pad, rev))
                    rev_rc = sequence.reverse_complement(rev)

                    # The node sequence is the first tail_size positions of
                    # rev_rc, then the overlapping middle, then the last
                    # tail_size positions of fwd. Where the two strands
                    # disagree in the middle the position is set to 4.
                    #TODO: IUPAC codes where different
                    inner_fwd = fwd[:-tail_size]
                    inner_rev = rev_rc[tail_size:]
                    consensus = numpy.where(
                        numpy.equal(inner_fwd, inner_rev), inner_fwd, 4)
                    self.add_sequence(
                        node_name,
                        numpy.concatenate(
                            (rev_rc[:tail_size], consensus,
                             fwd[-tail_size:])),
                        comments[node_name])

                elif parts[0] == 'ARC':
                    # The sign of a node id gives the orientation (>= 0 is
                    # forward); its absolute value names the node.
                    node_from = int(parts[1])
                    name_from = 'NODE_%d' % abs(node_from)
                    fwd_from = node_from >= 0

                    node_to = int(parts[2])
                    name_to = 'NODE_%d' % abs(node_to)
                    fwd_to = node_to >= 0

                    len_from = len(
                        self.sequences.sequence[
                            self.name_to_sequence[name_from]])

                    # 'X' * tail_size is a placeholder for the overlap text.
                    self.add_alignment('velvet_arc', name_from, fwd_from,
                                       len_from - tail_size, 'X' * tail_size,
                                       name_to, fwd_to, 0, 'X' * tail_size)
コード例 #12
0
                  help="Complement sequence",
                  default=False)
parser.add_option(
    "-b", "--reverseComplement", "--revComp",
    action="store_true", default=False,
    dest="reverseComplement",
    help="Reverse complement sequence")
options, args = parser.parse_args(sys.argv)

# sys.argv was passed in whole, so args[0] is the script path; the real
# positional arguments are args[1:] = <file> <start> <end>.
iFilename = args[1]
start = int(args[2])
end = int(args[3])

header, seq = fasta.load(iFilename)

# Header carries the requested range; a suffix records the transformation.
h = '%s %i-%i' % (header, start, end)
s = seq[start - 1:end]

# Only the first matching flag takes effect.
if options.reverse:
    s = sequence.reverse(s)
    h = h + '(r)'
elif options.complement:
    s = sequence.complement(s)
    h = h + '(c)'
elif options.reverseComplement:
    s = sequence.reverse_complement(s)
    h = h + '(rc)'

fasta.pretty(h, s, width=options.width)