コード例 #1
0
def index(filename, k):
    '''Build a k-mer lookup for every record of a FASTA file and store it.

    Each record yielded by FastaIterator.parse() is passed to
    dna2int.update_lookup() together with the summed length of all records
    seen before it (presumably so that positions refer to one concatenated
    coordinate system -- confirm against dna2int).  The lookup and the list
    of (record id, sequence length) pairs are then handed to store_index().

    Arguments:
        filename -- path of the FASTA file to index.
        k        -- k-mer length, forwarded to dna2int.update_lookup() and
                    store_index().

    The index name is the input name with its last '.'-separated component
    replaced by '.mfe_index'.  Progress and timing are printed to stdout.
    '''
    start = time.time()
    print('indexing %s' % filename)

    # NOTE(review): a filename without any '.' yields just '.mfe_index';
    # kept as-is because other code may derive the name the same way.
    dbname = '.'.join(filename.split('.')[:-1]) + '.mfe_index'

    kmer_lookup = collections.defaultdict(list)
    contig_lengths = []
    total_offset = 0

    # The context manager closes the FASTA handle even if parsing or
    # indexing raises part-way through the file.
    with open(filename) as fasta_file:
        for record in FastaIterator.parse(fasta_file):
            print(record.id)
            start_time = time.time()
            fasta_seq = record.seq
            seq_length = len(fasta_seq)
            dna2int.update_lookup(kmer_lookup, fasta_seq, total_offset, k)
            contig_lengths.append((record.id, seq_length))
            total_offset += seq_length
            print('%i bp took %.2f seconds' % (seq_length,
                                               time.time() - start_time))

    store_index(dbname, kmer_lookup, contig_lengths, k)

    print("Time used: %s" % str(time.time() - start))
    print('Done.')
コード例 #2
0
def _append_contig_hits(merged, contig_id, contig_hits):
    '''Fold one contig's per-k-mer position strings into the running table.

    For every k-mer id whose entry in contig_hits is non-empty, appends
    "<contig_id>:<positions>" to merged[mer_id]; entries belonging to
    different contigs are separated by ';'.  merged is modified in place.
    '''
    for mer_id, positions in enumerate(contig_hits):
        if not positions:
            continue
        entry = '%s:%s' % (contig_id, positions)
        if merged[mer_id]:
            merged[mer_id] += ';' + entry
        else:
            merged[mer_id] = entry


def _write_hits(conn, mer_count, plus, minus, is_db_new):
    '''Send the accumulated tables to the database.

    Calls insert_db() for the first write and update_db() for later ones.
    '''
    if is_db_new:
        insert_db(conn, mer_count, plus, minus)
    else:
        update_db(conn, mer_count, plus, minus)


def index(filename, k):
    '''Index the k-mer positions of a FASTA file into a sqlite3 database.

    The database name is the input name with its last '.'-separated
    component replaced by '.sqlite3.db'.  Its table "pos" is dropped and
    recreated with the columns mer_id, plus and minus.

    For each record, every window of k bases is converted with DNA2int_2()
    into a plus-strand and a minus-strand k-mer id.  The plus entry records
    the index of the window's last base (i + k - 1), the minus entry the
    index of its first base (i).  Positions of one contig are joined with
    ',', prefixed with "<record id>:", and contigs are joined with ';'.

    After each record, if get_memory_percent() reports more than 50, the
    accumulated tables are written out and emptied.  Whatever is left is
    written once the whole file has been read.

    Arguments:
        filename -- path of the FASTA file to index.
        k        -- k-mer length; 4**k table slots are allocated per strand.
    '''
    start = time()

    mer_count = 4**k

    # NOTE(review): a filename without any '.' yields just '.sqlite3.db';
    # kept as-is because other code may derive the name the same way.
    dbname = '.'.join(filename.split('.')[:-1]) + '.sqlite3.db'

    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        cur.executescript('''
    drop table if exists pos;
    create table pos(
    mer_id integer primary key, 
    plus text,
    minus text
    );''')

        plus = [''] * mer_count
        minus = [''] * mer_count

        # True only right after the tables were written out and emptied.
        # It starts False, so an input without records still triggers one
        # insert_db() call, exactly as before.
        is_empty = False
        is_db_new = True

        # The context manager closes the FASTA handle even on errors.
        with open(filename) as fasta_file:
            for record in FastaIterator.parse(fasta_file):
                is_empty = False
                print(record.id)

                fasta_seq = record.seq

                plus_mer_list = [''] * mer_count
                minus_mer_list = [''] * mer_count

                # A sequence of length L has L - k + 1 windows.  The
                # previous loop stopped one short and never indexed the
                # last k-mer of a contig.
                window_count = len(fasta_seq) - k + 1
                i = 0
                while i < window_count:
                    kmer = fasta_seq[i:i + k]
                    try:
                        plus_mer_id, minus_mer_id = DNA2int_2(kmer)
                    except Exception:
                        # Windows DNA2int_2 cannot convert (for instance
                        # ones containing 'N') are skipped.  Exception
                        # rather than a bare except, so KeyboardInterrupt
                        # and SystemExit still propagate.
                        i += 1
                        continue

                    if plus_mer_list[plus_mer_id]:
                        plus_mer_list[plus_mer_id] += ',%i' % (i + k - 1)
                    else:
                        plus_mer_list[plus_mer_id] = str(i + k - 1)

                    if minus_mer_list[minus_mer_id]:
                        minus_mer_list[minus_mer_id] += ',%i' % (i)
                    else:
                        minus_mer_list[minus_mer_id] = str(i)

                    i += 1
                    if not i % 100000:
                        # Float arithmetic: integer division always
                        # reported 0.00% here under Python 2.
                        elapsed = datetime.timedelta(seconds=(time() - start))
                        print("%s: %.2f%%, %s" % (record.id,
                                                  100.0 * i / window_count,
                                                  str(elapsed)))

                _append_contig_hits(plus, record.id, plus_mer_list)
                _append_contig_hits(minus, record.id, minus_mer_list)

                memory_percent = get_memory_percent()
                if memory_percent > 50:
                    _write_hits(conn, mer_count, plus, minus, is_db_new)
                    is_db_new = False

                    # Empty the containers
                    plus = [''] * mer_count
                    minus = [''] * mer_count
                    is_empty = True

                    print('Empty plus and minus due to the memory: %s.' % memory_percent)

        if not is_empty:
            _write_hits(conn, mer_count, plus, minus, is_db_new)
    finally:
        # Release the database handle on success and on failure alike.
        conn.close()

    print("Time used: %s" % str(datetime.timedelta(seconds=(time() - start))))
    print('Done.')