# Compare integer sequences by their N-mer content: counts are standardized
# against sequences drawn from a background model learned per sequence, then
# a matrix of dot-product similarities is written to the output file.

import sys

from numpy import zeros
from bx.cookbook.progress_bar import ProgressBar  # assumed source of ProgressBar

# Assumed to be defined elsewhere in this codebase (sketches follow below):
#   kmer_counts( ints, n ) -- dict mapping each observed n-mer (as a tuple) to its count
#   learn( ints, radix )   -- fit a background (e.g. Markov) model to an integer sequence
#   draw( tx, start, l )   -- sample a length-l sequence from that model
#   N                      -- module-level k-mer size

def main():
    # Read all the seqs (one whitespace-separated integer sequence per line)
    seqs = [ tuple( map( int, line.split() ) ) for line in sys.stdin ]
    # Initialize progress bar
    bar = ProgressBar( 0, len( seqs ), 80 )
    # 'Matrix' of standardized values, stored sparsely as one dict per sequence
    M = []
    # Open output
    out = open( sys.argv[1], "w" )
    # Iterate over all samples
    for count, ints in enumerate( seqs ):
        m = dict()
        # Standardize each N-mer that was observed, using R sampled sequences
        R = 100
        # Observed N-mer counts; a set gives O(1) membership tests below
        observed = kmer_counts( ints, N )
        allowed_keys = set( observed.keys() )
        sample_counts = dict( ( key, zeros( R ) ) for key in allowed_keys )
        radix = max( ints ) + 1
        l = len( ints )
        tx = learn( ints, radix )
        for r in range( R ):
            seq = draw( tx, 0, l )
            for i in range( l - N ):
                # tuple() so the key is hashable even if draw returns a list
                key = tuple( seq[i:i+N] )
                if key in allowed_keys:
                    sample_counts[key][r] += 1
        for key in allowed_keys:
            # Standardize by quantile style: divide the observed count by the
            # 75th percentile of the sampled counts (assumes p75 > 0, i.e. the
            # key appeared in at least a quarter of the sampled sequences)
            sample_counts[key].sort()
            q = int( len( sample_counts[key] ) * .75 )
            p75 = sample_counts[key][q]
            m[key] = observed[ key ] / p75
            # Threshold style
            #pi = sum( observed[key] > sample_counts[key] ) / R
            #if pi > T: m[key] = 1
            # Just standardize by length style
            #m[key] = observed[ key ] / l
        # Store the standardized values (sparsely)
        M.append( m )
        # Progress bar update
        bar.update_and_print( count )
    # Finish the progress bar line
    print
    # Compute 'distance': dot product of the sparse standardized count vectors
    for i in range( len( seqs ) ):
        out.write( "V%d" % i )
        for j in range( len( seqs ) ):
            in_both = 0
            for key in M[i]:
                if key in M[j]:
                    in_both += ( M[i][key] * M[j][key] )
            # in_both = sum( [ key in M[j] for key in M[i] ] )
            out.write( "\t%.9f" % in_both )
        out.write( "\n" )
        bar.update_and_print( i )

if __name__ == "__main__":
    main()
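# ---------------------------------------------------------------------------
# kmer_counts, learn and draw are not defined in this file. The sketch below
# is one plausible reading, assuming a first-order Markov background model
# with add-one smoothing; the real implementations may well differ.
# ---------------------------------------------------------------------------
import random

def kmer_counts( ints, n ):
    # Count every length-n window of the sequence, keyed by tuple
    counts = dict()
    for i in range( len( ints ) - n + 1 ):
        key = tuple( ints[i:i+n] )
        counts[key] = counts.get( key, 0 ) + 1
    return counts

def learn( ints, radix ):
    # Row-normalized first-order transition matrix (add-one smoothed)
    tx = [ [ 1.0 ] * radix for _ in range( radix ) ]
    for a, b in zip( ints, ints[1:] ):
        tx[a][b] += 1
    for row in tx:
        total = sum( row )
        for s in range( radix ):
            row[s] /= total
    return tx

def draw( tx, start, l ):
    # Sample a length-l sequence from the transition matrix, starting at `start`
    seq = [ start ]
    for _ in range( l - 1 ):
        r, acc = random.random(), 0.0
        for s, p in enumerate( tx[ seq[-1] ] ):
            acc += p
            if r <= acc:
                seq.append( s )
                break
        else:
            # Guard against floating point round-off in the row sum
            seq.append( len( tx ) - 1 )
    return tuple( seq )
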
# Count alignment columns for a set of species across MAF files and print
# them with a PHAST-style sufficient statistics (.ss) header.

import sys

import bx.align.maf
from bx.cookbook.progress_bar import ProgressBar  # assumed source of ProgressBar

# get_texts and update_counts are assumed to be defined elsewhere in this
# file (sketches follow below)

species = sys.argv[1].split( "," )
maf_fnames = sys.argv[2:]
# Column -> count, accumulated over all input MAF files
counts = dict()
for fname in maf_fnames:
    print >> sys.stderr, "Processing", fname
    f = open( fname )
    # Determine file size (seek to end, tell, seek back) to drive the progress bar
    f.seek( 0, 2 )
    file_size = f.tell()
    f.seek( 0, 0 )
    bar = ProgressBar( 0, file_size, 80 )
    for i, block in enumerate( bx.align.maf.Reader( f ) ):
        texts = get_texts( block, species )
        # Increment count for each column
        update_counts( texts, counts )
        if i % 100 == 0:
            bar.update_and_print( f.tell(), sys.stderr )
    print >> sys.stderr, "Done."
print "NSEQS =", len( species )
print "LENGTH = 2886607813"  # NOTE: hardcoded total sequence length
print "TUPLE_SIZE = 1"
print "NTUPLES = ", len( counts )
print "NAMES = ", ",".join( species )
print "ALPHABET = ACGT"
print "NCATS = -1"
print
for i, ( col, count ) in enumerate( counts.iteritems() ):
    print i, col, count
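# ---------------------------------------------------------------------------
# get_texts and update_counts are not shown above. A minimal hypothetical
# sketch of what the loop assumes: pull the aligned text for each requested
# species out of a MAF block (padding with '*' when a species is absent),
# then tally every alignment column.
# ---------------------------------------------------------------------------
def get_texts( block, species ):
    # One aligned text per species, in the order requested
    texts = []
    for s in species:
        c = block.get_component_by_src_start( s )
        if c:
            texts.append( c.text.upper() )
        else:
            texts.append( "*" * block.text_size )
    return texts

def update_counts( texts, counts ):
    # Each column key is the concatenation of one character per species
    for col in zip( *texts ):
        key = "".join( col )
        counts[key] = counts.get( key, 0 ) + 1
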
# Size-weighted average-linkage agglomerative clustering of a similarity
# matrix read from stdin; merges are printed in R hclust style.

import sys

from numpy import ones
from bx.cookbook.progress_bar import ProgressBar  # assumed source of ProgressBar

# read_data, setup_nn, max_indexes and update_nn are assumed to be defined
# elsewhere in this file (sketches follow below)

def main():
    # Read affinity / similarity / kernel matrix
    print >> sys.stderr, "Reading"
    L = read_data( sys.stdin )
    N = len( L )
    # Make self similarities small so a row is never its own nearest neighbor
    for i in range( N ):
        L[i,i] = -1
    nearest, nearest_sim = setup_nn( L )
    # ---- Cluster ----------------------------------------------------
    print >> sys.stderr, "Clustering"
    bar = ProgressBar( 0, N, 80 )
    n_clusters = N
    cluster_sizes = ones( N )
    # 1 while a row is still a live cluster
    good_indexes = ones( N )
    stage = 0
    cluster_stages = dict()
    while n_clusters > 1:
        # Find closest pair
        i, j = max_indexes( L, nearest, nearest_sim )
        similarity = L[i,j]
        print n_clusters - 1,
        # Print merge info in R hclust style: negative numbers are original
        # observations, positive numbers refer to an earlier merge stage
        if i in cluster_stages:
            print cluster_stages[i] + 1,
        else:
            print - (i+1),
        if j in cluster_stages:
            print cluster_stages[j] + 1,
        else:
            print - (j+1),
        print similarity,
        print
        # Update L: size-weighted average linkage, merging j into i
        L[i] = ( L[i] * cluster_sizes[i] + L[j] * cluster_sizes[j] ) / ( cluster_sizes[i] + cluster_sizes[j] )
        L[:,i] = L[i]
        # Keep self similarities small
        L[i,i] = -1
        cluster_sizes[i] += cluster_sizes[j]
        cluster_stages[i] = stage
        good_indexes[j] = 0
        # Just to be safe, make the dead row/column unselectable
        L[j] = -1; L[:,j] = -1
        # Update NN
        nearest[j] = -1
        nearest_sim[j] = -1
        update_nn( L, i, j, nearest, nearest_sim )
        # Counters and status
        stage += 1
        n_clusters -= 1
        sys.stdout.flush()
        bar.update_and_print( N - n_clusters, sys.stderr )

if __name__ == "__main__":
    main()
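# ---------------------------------------------------------------------------
# read_data, setup_nn, max_indexes and update_nn are not shown above. A
# minimal hypothetical sketch consistent with how main() uses them: a square
# similarity matrix read from whitespace-separated rows, plus a per-row
# nearest-neighbor cache so each merge avoids rescanning the whole matrix.
# ---------------------------------------------------------------------------
from numpy import array, argmax

def read_data( f ):
    # Square similarity matrix, one whitespace-separated row per line
    return array( [ map( float, line.split() ) for line in f ] )

def setup_nn( L ):
    # For each row, cache the index and similarity of its best match
    nearest = [ int( argmax( row ) ) for row in L ]
    nearest_sim = [ L[k, nearest[k]] for k in range( len( L ) ) ]
    return nearest, nearest_sim

def max_indexes( L, nearest, nearest_sim ):
    # Globally closest pair, read straight from the cache
    i = int( argmax( nearest_sim ) )
    return i, nearest[i]

def update_nn( L, i, j, nearest, nearest_sim ):
    # Row i changed and row j died; refresh any cache entry that could be
    # stale: row i itself, rows whose neighbor was i or j, and rows whose
    # new similarity to i beats their cached best
    for k in range( len( L ) ):
        if k == i or nearest[k] in ( i, j ) or L[k, i] > nearest_sim[k]:
            nearest[k] = int( argmax( L[k] ) )
            nearest_sim[k] = L[k, nearest[k]]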