def gen_read(strand, read_len):
    """Return a noisy read of ``read_len`` symbols cut from ``strand``.

    A window of ``read_len`` consecutive symbols is taken at a uniformly
    chosen start position; every valid window, including the one that ends
    at the last symbol of ``strand``, can be selected.  Each symbol of the
    window is then replaced by the symbol that ``utils.loaded_dice`` picks
    from the ``MISREAD_PROBS`` row belonging to the original symbol.

    Args:
        strand: source sequence; its symbols must be keys of
            ``MISREAD_BASE_TO_IDX``.
        read_len: number of symbols in the read.

    Returns:
        The read as a string.

    Raises:
        ValueError: if ``read_len`` is larger than ``len(strand)``.
    """
    if read_len > len(strand):
        raise ValueError(
            "read_len (%d) exceeds strand length (%d)" % (read_len, len(strand))
        )

    # random.randint includes BOTH endpoints, so the last valid start
    # position is exactly len(strand) - read_len (no extra "- 1").
    startpos = random.randint(0, len(strand) - read_len)
    seq = list(strand[startpos:startpos + read_len])
    for i, base in enumerate(seq):
        # Row of substitution probabilities for the base actually present.
        probs = MISREAD_PROBS[MISREAD_BASE_TO_IDX[base]]
        seq[i] = MISREAD_IDX_TO_BASE[utils.loaded_dice(probs)]
    return "".join(seq)
def kpp(k, reads):
    """Choose initial centroids for clustering ``reads``.

    The first centroid is a uniformly random read.  Every further centroid
    is drawn with ``utils.loaded_dice`` from weights proportional to the
    squared ``min_distance`` between each read and the centroids chosen so
    far.  Each pick is passed through ``prepare_centroid`` before it is
    stored, and a pick equal to an already stored centroid is discarded.

    Args:
        k: number of centroids wanted.
        reads: indexable collection of reads to pick from.

    Returns:
        List of centroids; it always holds at least one entry, because the
        first centroid is chosen before ``k`` is checked.
    """
    # NOTE(review): if every read is at distance 0 from the chosen centroids
    # the weight total is 0 and the division below fails; if fewer than k
    # distinct centroids can be produced the loop may never finish -- confirm
    # callers guarantee enough distinct reads.
    chosen = [prepare_centroid(random.choice(reads), reads)]
    while len(chosen) < k:
        weights = [min_distance(chosen, read) ** 2 for read in reads]
        total = sum(weights)
        pick = utils.loaded_dice([w / total for w in weights])
        candidate = prepare_centroid(reads[pick], reads)
        # Keep the centroid only if it is not a duplicate of an earlier one.
        if candidate not in chosen:
            chosen.append(candidate)
    return chosen
def prepare_centroid(centroid, reads):
    """Fill the missed bases (``"_"``) of ``centroid`` using ``reads``.

    A centroid without ``"_"`` is returned unchanged.  Otherwise every
    ``"_"`` position is replaced by the symbol a donor read has at that
    position.  The donor is drawn with ``utils.loaded_dice`` from weights
    ``1 - distance(centroid, read)``, where reads that themselves have
    ``"_"`` at the position get weight 0.  A position stays ``"_"`` when
    the remaining weights sum to zero.

    Args:
        centroid: candidate centroid string, possibly containing ``"_"``.
        reads: indexable collection of reads, each at least as long as
            ``centroid`` at the positions being filled.

    Returns:
        The centroid string with as many ``"_"`` positions filled as
        possible.
    """
    if "_" not in centroid:
        # Nothing is missing, hand back the original object.
        return centroid

    # Weights are computed once against the unfilled centroid.
    similarity = [1. - distance(centroid, read) for read in reads]
    filled = list(centroid)
    for pos, base in enumerate(centroid):
        if base != "_":
            continue
        # Reads that miss this position too cannot act as donors.
        weights = [
            0.0 if read[pos] == "_" else weight
            for read, weight in zip(reads, similarity)
        ]
        if sum(weights) == 0.0:
            # No usable donor for this position; leave the gap in place.
            continue
        donor = utils.loaded_dice(normalize(weights))
        filled[pos] = reads[donor][pos]
    return "".join(filled)