def reassign_reads(readsf, priors, mates, constraints, soft_assign, initial_seed):
    """Move every read to its best-scoring cluster and rewrite the cluster files.

    Reads are streamed from the existing ``cluster-<c>.fa`` files, written to
    ``cluster-<i>.tmp`` for the chosen cluster ``i``, and the ``.tmp`` files
    are renamed over the ``.fa`` files once everything has been written.

    Args:
        readsf: forwarded to ``update_priors``; not used directly here.
        priors: one entry per cluster; ``len(priors)`` is the cluster count.
            Re-estimated through ``update_priors`` when the module-level
            ``use_priors`` flag is set.
        mates: forwarded to ``update_priors`` / ``get_read_probs``.
        constraints: mapping of read name -> cluster index; a read listed
            here is always written to that cluster.
        soft_assign: if true, also write ``cluster-<i>.build.fa`` containing
            every read whose probability for cluster ``i`` exceeds the
            module-level ``soft_assign_t``; headers there are rewritten as
            ``>prob;name``.
        initial_seed: if true, only ``cluster-0.fa`` is read as input
            (presumably all reads start there on the first pass -- confirm
            with the caller).

    Returns:
        ``(rsments, likelihood, priors)``: the number of reads whose cluster
        changed, the likelihood reported by ``get_read_probs``, and the
        (possibly updated) priors.

    Raises:
        SystemExit: with status 1 if a read has no entry in the
            probabilities returned by ``get_read_probs``.
    """
    import sys

    k = len(priors)

    if use_priors:
        priors = update_priors(priors, readsf, mates, constraints, soft_assign)

    (likelihood, read_probs) = get_read_probs(priors, mates, constraints, soft_assign)

    if initial_seed:
        myk = 1
    else:
        myk = k

    read_files = []
    build_files = []
    rsments = 0
    try:
        # open output files
        for i in range(k):
            read_files.append(open('cluster-%d.tmp' % i, 'w'))
            if soft_assign:
                build_files.append(open('cluster-%d.build.fa' % i, 'w'))

        for c in range(myk):
            # reassign; the input is closed before the rename step below
            with open('cluster-%d.fa' % c) as cluster_in:
                for line in cluster_in:
                    is_header = line[0] == '>'
                    if is_header:
                        r = line[1:].strip()  # remove front spaces

                        if r not in read_probs:
                            print('ERROR: missing read %s scores' % r)
                            # non-zero status so callers can detect the failure
                            sys.exit(1)
                        elif r in constraints:
                            if constraints[r] != c:
                                print('Found a constrained read in the wrong cluster')
                            max_icm = constraints[r]
                        else:
                            (max_prob, max_icm) = util.max_i(read_probs[r])

                        # count reassignments
                        if max_icm != c:
                            rsments += 1

                    # Sequence lines follow their header: r and max_icm still
                    # hold the values set by the most recent '>' line.
                    read_files[max_icm].write(line)
                    if soft_assign:
                        probs = read_probs[r]
                        for i in range(k):
                            if probs[i] > soft_assign_t:
                                if is_header:
                                    build_files[i].write('>%f;%s' % (probs[i], line[1:]))
                                else:
                                    build_files[i].write(line)
    finally:
        # close files even when an error (or sys.exit) interrupts the pass
        for handle in read_files + build_files:
            handle.close()

    # move tmp
    for i in range(k):
        os.rename('cluster-%d.tmp' % i, 'cluster-%d.fa' % i)

    return (rsments, likelihood, priors)
def init_clusters(readsf, soft_assign):
    """Create the initial ``cluster-<c>.fa`` files from per-read likelihoods.

    Likelihoods are read from ``sample.fa.binning.allprobs``: one
    tab-separated line per read, the read name first and one score per
    cluster after it. Each read is hard-assigned to its best-scoring cluster
    and its FASTA record from ``readsf`` is copied into that cluster's file.

    Args:
        readsf: path of the FASTA file holding all reads. It is re-read once
            per chunk of ``chunk_size`` clusters so that at most that many
            cluster files (plus as many build files) are open at a time.
        soft_assign: if true, also write ``cluster-<c>.build.fa`` with every
            read whose normalised probability for cluster ``c`` exceeds
            ``imm_cluster.soft_assign_t``; headers there are rewritten as
            ``>prob;name``.

    Raises:
        ValueError: if the likelihood file contains no reads.
    """
    # load_mates
    # Mate loading is currently disabled, so every read is treated as unmated.
    mates = {}
    #if matesf: ... just in case I need this later ...
    #    for line in open(options.mates_file):
    #        (lr,rr) = line.split()
    #        mates[lr] = rr
    #        mates[rr] = lr

    read_likes = {}
    k = 0
    with open('sample.fa.binning.allprobs') as probs_in:
        for line in probs_in:
            if not line.strip():
                # a blank line would otherwise register a read named '' and
                # could reset the cluster count to zero
                continue
            a = line.split('\t')
            r = a[0].strip()
            read_likes[r] = [float(x) for x in a[1:]]
            # the cluster count is taken from the last line read
            k = len(a) - 1

    if not read_likes:
        raise ValueError('no read likelihoods found in sample.fa.binning.allprobs')

    # Note that I'm assuming here that the cluster priors
    # are incorporated into the printed likelihoods.  I can't
    # be sure but I'd rather be wrong and have used a uniform
    # prior than reestimate the priors myself and be wrong and
    # double count them.  However, this means I am double
    # counting the prior for mated reads.

    # assign to clusters
    hard_clusters = {}
    soft_clusters = {}
    for r in read_likes:
        if r in hard_clusters:
            # mate may have been done
            continue

        # if mated, combine likelihood with the mate's
        if r in mates:
            m = mates[r]
            clust_likes = [read_likes[r][i] + read_likes[m][i] for i in range(k)]
        else:
            m = r  # unmated: the read stands in as its own mate
            clust_likes = read_likes[r]

        # hard assignment
        (like_max, clust) = util.max_i(clust_likes)
        hard_clusters[r] = clust
        hard_clusters[m] = clust

        # soft assignment
        if soft_assign:
            # presumably log_add is log(exp(a) + exp(b)), making sum_score the
            # log normaliser of the scores -- confirm in imm_cluster
            sum_score = clust_likes[0]
            for i in range(1, k):
                sum_score = imm_cluster.log_add(sum_score, clust_likes[i])

            # Create the list once, before the loop, and share it with the
            # mate; resetting it per cluster would drop earlier entries.
            kept = []
            for i in range(k):
                prob = math.exp(clust_likes[i] - sum_score)
                if prob > imm_cluster.soft_assign_t:
                    kept.append((i, prob))
            soft_clusters[r] = kept
            if r != m:
                soft_clusters[m] = list(kept)

    chunk_size = 50
    for chunk_start in range(0, k, chunk_size):
        init_files = {}
        build_files = {}
        try:
            # open files
            for c in range(chunk_start, min(k, chunk_start + chunk_size)):
                init_files[c] = open('cluster-%d.fa' % c, 'w')
                if soft_assign:
                    build_files[c] = open('cluster-%d.build.fa' % c, 'w')

            # read fasta to cluster-*.fa
            hc = -1  # -1: current record has no assignment, so skip its lines
            with open(readsf) as reads_in:
                for line in reads_in:
                    is_header = line[0] == '>'
                    if is_header:
                        r = line[1:].strip()  # front spaces are removed by LikelyBin
                        hc = hard_clusters.get(r, -1)
                    if hc == -1:
                        continue

                    if hc in init_files:
                        init_files[hc].write(line)
                    if soft_assign:
                        for (sc, p) in soft_clusters[r]:
                            # soft clusters outside this chunk are written on
                            # the pass that has their build file open
                            if sc in build_files:
                                if is_header:
                                    build_files[sc].write('>%f;%s' % (p, line[1:]))
                                else:
                                    build_files[sc].write(line)
        finally:
            # close files
            for handle in list(init_files.values()) + list(build_files.values()):
                handle.close()