Beispiel #1
0
def gen_read(strand, read_len):
    """Return a simulated noisy read of ``read_len`` bases taken from ``strand``.

    A window of ``read_len`` consecutive bases is picked at a random start
    position, then every base in the window is replaced by a base drawn from
    the row of ``MISREAD_PROBS`` that corresponds to the original base.

    Args:
        strand: the source sequence (indexable, sliceable sequence of bases).
        read_len: number of bases in the generated read.

    Returns:
        The read as a single string.

    Raises:
        ValueError: if ``read_len`` is larger than ``len(strand)``.
    """
    # random.randint includes BOTH endpoints, so the last valid start position
    # is len(strand) - read_len (a window that ends exactly at the strand end).
    last_start = len(strand) - read_len
    if last_start < 0:
        raise ValueError("read_len (%d) exceeds strand length (%d)"
                         % (read_len, len(strand)))
    startpos = random.randint(0, last_start)

    noisy = []
    for base in strand[startpos:startpos + read_len]:
        probs = MISREAD_PROBS[MISREAD_BASE_TO_IDX[base]]
        # presumably loaded_dice returns an index sampled according to
        # `probs` -- it is used as a key into MISREAD_IDX_TO_BASE; confirm in utils
        noisy.append(MISREAD_IDX_TO_BASE[utils.loaded_dice(probs)])
    return "".join(noisy)
Beispiel #2
0
def kpp(k, reads):
    """Pick ``k`` distinct initial centroids from ``reads``.

    The first centroid is a randomly chosen read. Each further centroid is
    drawn with ``utils.loaded_dice`` using weights proportional to the squared
    value of ``min_distance(centroids, read)`` for every read. Every chosen
    read is passed through ``prepare_centroid`` before it is stored, and a
    candidate that equals an already stored centroid is discarded and redrawn.

    NOTE(review): this looks like k-means++ style seeding, assuming
    ``min_distance`` returns the distance to the nearest existing centroid --
    confirm against its definition.

    Args:
        k: number of centroids to produce.
        reads: sequence of reads to choose from.

    Returns:
        A list of centroids (at least one, even when ``k`` is smaller than 1).
    """
    seed = random.choice(reads)
    chosen = [prepare_centroid(seed, reads)]
    while len(chosen) < k:
        weights = []
        for read in reads:
            weights.append(min_distance(chosen, read) ** 2)
        total = sum(weights)
        # presumably loaded_dice returns an index into `reads`; it is used as one below
        pick = utils.loaded_dice([w / total for w in weights])
        candidate = prepare_centroid(reads[pick], reads)
        # only keep candidates that are not already present; otherwise redraw
        if candidate not in chosen:
            chosen.append(candidate)
    return chosen
Beispiel #3
0
def prepare_centroid(centroid, reads):
    """Fill the "_" positions of ``centroid`` with bases borrowed from ``reads``.

    A centroid without any "_" is returned unchanged. Otherwise, for each "_"
    position a donor read is drawn with ``utils.loaded_dice``; each read is
    weighted by ``1 - distance(centroid, read)``, and reads that also have "_"
    at that position get weight zero. If all weights at a position sum to
    zero, the "_" is left in place.

    Args:
        centroid: the candidate centroid sequence, possibly containing "_".
        reads: sequence of reads that may donate bases.

    Returns:
        The original ``centroid`` object when it has no "_", otherwise a new
        string with the gaps filled where a donor was available.
    """
    if all(ch != "_" for ch in centroid):
        return centroid  # nothing is missing, hand back the original

    filled = list(centroid)
    # unnormalized donor weights, computed once against the original centroid
    base_weights = [1. - distance(centroid, read) for read in reads]

    for pos, base in enumerate(centroid):
        if base != "_":
            continue
        # a read that is itself missing this position cannot donate a base
        weights = [
            0.0 if read[pos] == "_" else weight
            for read, weight in zip(reads, base_weights)
        ]
        if sum(weights) == 0.0:
            continue  # no usable donor for this position, keep the "_"
        donor = utils.loaded_dice(normalize(weights))
        filled[pos] = reads[donor][pos]
    return "".join(filled)