# Remove wt entries tmp = seqarray.copy() tmp[:,wtrow] = 0 # Store results from this chunk mutarray_lil[startrow:(endrow+1),:] = tmp # Increment rows startrow = endrow+1 endrow = startrow + chunksize - 1 # Convert to csr matrix mutarray_csr = mutarray_lil.tocsr() # Return vararray as well as binary representation of wt seq return mutarray_csr, wtrow

# ---------------------------------------------------------------------------
# Module-level demo: simulate a library of sequences, encode it both as a
# seqarray and as a mutarray, and report how much smaller the mutarray is.
# ---------------------------------------------------------------------------

# Create sequences to test this on
wtseq = 'AAAAAAAGTGAGATGGCAATCTAATTCGGCACCCCAGGTTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGG'
dataset_df = simulate_library(wtseq, numseq=10000, mutrate=.1, tags=True)

# Encode the same dataset two ways, both with the 'MAT' model type.
seqarray = dataset2seqarray(dataset_df, modeltype='MAT')
# dataset2mutarray returns the CSR matrix together with wtrow.
mutarray, wtrow = dataset2mutarray(dataset_df, modeltype='MAT')

# Print compression results
# NOTE(review): nbytes is a helper defined elsewhere; presumably it returns
# the in-memory size in bytes of either array type -- confirm.
seqarray_size = nbytes(seqarray)
mutarray_size = nbytes(mutarray)

# Single-argument, parenthesised print produces identical output under
# Python 2 (print statement) and Python 3 (print function).
print('size of seqarray = %d' % seqarray_size)
print('size of mutarray = %d' % mutarray_size)
# The leading 1. forces true division under Python 2 integer semantics.
print('compression ratio = %.1f' % (1. * seqarray_size / mutarray_size))