import sys
from numpy import zeros

# kmer_counts, learn, draw, ProgressBar, and the N-mer size N (plus the
# threshold T used in a commented-out variant) are assumed to be supplied by
# the surrounding project; minimal sketches of the helpers follow main().

def main():
    # Read all the sequences: one whitespace-separated integer sequence per line
    seqs = [ tuple( map( int, line.split() ) ) for line in sys.stdin ]
    # Initialize progress bar
    bar = ProgressBar( 0, len( seqs ), 80 )
    # 'Matrix' of standardized values
    M = []
    # Open output
    out = open( sys.argv[1], "w" )
    # Iterate over all samples
    for count, ints in enumerate( seqs ):
        m = dict()
        # Standardize each N-mer that was observed
        R = 100  # number of null sequences to sample
        # Observed N-mer counts for this sequence
        observed = kmer_counts( ints, N )
        allowed_keys = set( observed.keys() )  # set for fast membership tests
        sample_counts = dict( ( key, zeros( R ) ) for key in allowed_keys )
        radix = max( ints ) + 1
        l = len( ints )
        tx = learn( ints, radix )
        for r in range( R ):
            seq = draw( tx, 0, l )
            for i in range( l - N ):
                key = seq[i:i+N]
                if key in allowed_keys:
                    sample_counts[key][r] += 1
        for key in allowed_keys:
            # Quantile-style standardization: divide the observed count by
            # the 75th percentile of the null-sample counts
            sample_counts[key].sort()
            q = int( len( sample_counts[key] ) * .75 )
            p75 = sample_counts[key][q]
            m[key] = observed[ key ] / p75
            # Threshold style
            #pi = sum( observed[key] > sample_counts[key] ) / R
            #if pi > T: m[key] = 1
            # Just standardize by length style
            #m[key] = observed[ key ] / l
            
        # Store the standardized values (sparsely)
        M.append( m )
        # Progress bar update
        bar.update_and_print( count )
    print
    # Compute pairwise 'distance': a sparse inner product of the
    # standardized N-mer profiles (larger = more similar)
    for i in range( len( seqs ) ):
        out.write( "V%d" % i )
        for j in range( len( seqs ) ):
            in_both = 0
            for key in M[i]:
                if key in M[j]:
                    in_both += ( M[i][key] * M[j][key] )
            # in_both = sum( [ key in M[j] for key in M[i] ] )
            out.write( "\t%.9f" % in_both )
        out.write( "\n" )
        bar.update_and_print( i )
    out.close()
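
# ---------------------------------------------------------------------------
# kmer_counts, learn, and draw are not shown above. The sketches below are
# plausible implementations inferred from how main() calls them (a first-
# order Markov chain learned from each sequence, then sampled to build a
# null distribution of N-mer counts); the originals may differ.
# ---------------------------------------------------------------------------

import random
from collections import defaultdict

def kmer_counts( ints, n ):
    # Count each n-mer (a tuple slice) in the integer sequence, using the
    # same l - n window convention as the sampling loop in main()
    counts = defaultdict( int )
    for i in range( len( ints ) - n ):
        counts[ ints[i:i+n] ] += 1
    return counts

def learn( ints, radix ):
    # First-order Markov chain: row-normalized transition counts
    tx = [ [ 0.0 ] * radix for _ in range( radix ) ]
    for a, b in zip( ints, ints[1:] ):
        tx[a][b] += 1
    for row in tx:
        total = sum( row )
        if total:
            for k in range( radix ):
                row[k] /= total
    return tx

def draw( tx, start, length ):
    # Sample `length` symbols from the chain, beginning at symbol `start`
    seq = [ start ]
    while len( seq ) < length:
        row = tx[ seq[-1] ]
        r, acc = random.random(), 0.0
        for symbol, p in enumerate( row ):
            acc += p
            if r <= acc:
                seq.append( symbol )
                break
        else:
            # No outgoing transitions were observed; fall back to uniform
            seq.append( random.randrange( len( row ) ) )
    return tuple( seq )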
Example No. 2

import sys
import bx.align.maf
# ProgressBar, get_texts, and update_counts are assumed project helpers;
# minimal sketches of the latter two are given below.

counts = {}

species = sys.argv[1].split( "," )
maf_fnames = sys.argv[2:]
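
# get_texts and update_counts are not shown in this listing; these are
# minimal sketches inferred from how the loop below uses them, not the
# originals.

def get_texts( block, species ):
    # Aligned text for each requested species in this MAF block, with an
    # all-gap placeholder when a species is absent from the block
    texts = []
    for s in species:
        c = block.get_component_by_src_start( s )
        texts.append( c.text.upper() if c else "-" * block.text_size )
    return texts

def update_counts( texts, counts ):
    # Increment the count of each alignment column (one character per species)
    for col in zip( *texts ):
        col = "".join( col )
        try:
            counts[ col ] += 1
        except KeyError:
            counts[ col ] = 1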

for fname in maf_fnames:
    print >> sys.stderr, "Processing", fname
    f = open( fname )
    # Determine file size
    f.seek( 0, 2 )
    file_size = f.tell()
    f.seek( 0, 0 )
    bar = ProgressBar( 0, file_size, 80 )
    for i, block in enumerate( bx.align.maf.Reader( f ) ):
        texts = get_texts( block, species )
        # Increment count for each column
        update_counts( texts, counts )
        if i % 100 == 0:
            bar.update_and_print( f.tell(), sys.stderr )
    f.close()
    print >> sys.stderr, "Done."

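# The output below resembles a PHAST sufficient-statistics (.ss) header;
# that reading is an inference from the field names.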
print "NSEQS =", len( species )
print "LENGTH = 2886607813"
print "TUPLE_SIZE = 1"
print "NTUPLES = ", len( counts )
print "NAMES = ", ",".join( species )
print "ALPHABET = ACGT"
print "NCATS = -1"
Example No. 3
import sys
from numpy import compress, nonzero, ones

# read_data, ProgressBar, and the nearest-neighbour helpers setup_nn,
# max_indexes, and update_nn are assumed project code; minimal sketches of
# the nearest-neighbour helpers follow main().

def main():
    # Read affinity / similarity / kernel matrix
    print >> sys.stderr, "Reading"
    L = read_data( sys.stdin )
    N = len( L )
    # Make self similarities small
    for i in range( N ): L[i,i] = -1
    
    nearest, nearest_sim = setup_nn( L )

    # ---- Cluster ----------------------------------------------------

    print >> sys.stderr, "Clustering"
    bar = ProgressBar( 0, N, 80 )

    n_clusters = N
    cluster_sizes = ones( N )
    good_indexes = ones( N )
    stage = 0
    cluster_stages = dict()
    while n_clusters > 1:
        # if n_clusters < 10:
        #     print >> sys.stderr, "----------"
        #     print >> sys.stderr, compress( good_indexes, compress( good_indexes, L, 1 ), 0 )
        #     print >> sys.stderr, "----------"
        #     print nonzero( good_indexes )
        #     print "----------"
        # Find closest pair
        i, j = max_indexes( L, nearest, nearest_sim )
        similarity = L[i,j]
        print n_clusters - 1,
        # Print merge info
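        # (negative = 1-based original item, positive = 1-based earlier merge
        #  stage, akin to R hclust's merge encoding)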
        if i in cluster_stages:
            print cluster_stages[i] + 1,
        else:
            print -(i+1),
        if j in cluster_stages:
            print cluster_stages[j] + 1,
        else:
            print -(j+1),
        print similarity,
        print
        # Update L: the merged row/column is the size-weighted average of the
        # two clusters' similarities (average linkage)
        L[i] = ( L[i] * cluster_sizes[i] + L[j] * cluster_sizes[j] ) / ( cluster_sizes[i] + cluster_sizes[j] )
        L[:,i] = L[i]
        # Keep self similarities small
        L[i,i] = -1
        cluster_sizes[i] += cluster_sizes[j]
        cluster_stages[i] = stage
        good_indexes[j] = 0
        # Just to be safe
        L[j] = -1; L[:,j] = -1
        # Update NN
        nearest[j] = -1
        nearest_sim[j] = -1
        update_nn( L, i, j, nearest, nearest_sim )
        # Counters and status
        stage += 1
        n_clusters -= 1
        sys.stdout.flush()
        bar.update_and_print( N-n_clusters, sys.stderr )
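
# ---------------------------------------------------------------------------
# setup_nn, max_indexes, and update_nn are not shown above. The sketches
# below assume L is a square numpy array and implement the nearest-neighbour
# cache that main() appears to rely on: nearest[i] is the most similar
# partner of cluster i and nearest_sim[i] is that similarity. The originals
# may differ.
# ---------------------------------------------------------------------------

from numpy import argmax

def setup_nn( L ):
    nearest = [ 0 ] * len( L )
    nearest_sim = [ 0.0 ] * len( L )
    for i in range( len( L ) ):
        nearest[i] = int( argmax( L[i] ) )
        nearest_sim[i] = L[ i, nearest[i] ]
    return nearest, nearest_sim

def max_indexes( L, nearest, nearest_sim ):
    # The globally closest pair is the row whose cached neighbour is best
    i = int( argmax( nearest_sim ) )
    return i, nearest[i]

def update_nn( L, i, j, nearest, nearest_sim ):
    # Row i was rewritten by the merge and row j retired; refresh any cache
    # entry that pointed at either, or that the new row i now beats
    for k in range( len( L ) ):
        if nearest_sim[k] == -1:
            continue  # retired cluster
        if nearest[k] in ( i, j ) or L[ k, i ] > nearest_sim[k]:
            nearest[k] = int( argmax( L[k] ) )
            nearest_sim[k] = L[ k, nearest[k] ]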