def countUMINaive(molecular_barcodes, allowed_mismatches):
    """
    Finds clusters of similar UMIs using a naive proximity approach:
    the UMIs are sorted and every UMI whose hamming distance to the
    previous UMI (in sorted order) is at most the given number of
    mismatches joins the cluster of that previous UMI, otherwise it
    starts a new cluster.
    It returns one UMI per cluster; for clusters with multiple UMIs
    a random member is selected.
    :param molecular_barcodes: a list of UMIs
    :param allowed_mismatches: maximum hamming distance allowed between
                               two consecutive (sorted) UMIs of a cluster
    :type allowed_mismatches: integer
    :return: a list of unique UMIs (one per cluster, empty for no input)
    :rtype: list
    """
    # Clusters are kept in creation order; a list (instead of a dict keyed
    # by a running counter) makes the output order deterministic and avoids
    # the Python 2 only dict.itervalues()
    clusters = []
    for molecular_barcode in sorted(molecular_barcodes):
        # Only the last member of the most recent cluster is compared,
        # i.e. the UMI that precedes this one in sorted order
        if clusters and hamming_distance(clusters[-1][-1],
                                         molecular_barcode) <= allowed_mismatches:
            clusters[-1].append(molecular_barcode)
        else:
            clusters.append([molecular_barcode])
    # Return one representative per cluster
    return [random.choice(members) for members in clusters]
def countUMINaive(molecular_barcodes, allowed_mismatches):
    """
    Finds clusters of similar UMIs using a naive proximity approach:
    the UMIs are sorted and every UMI whose hamming distance to the
    previous UMI (in sorted order) is at most the given number of
    mismatches joins the cluster of that previous UMI, otherwise it
    starts a new cluster.
    It returns one UMI per cluster; for clusters with multiple UMIs
    a random member is selected.
    :param molecular_barcodes: a list of UMIs
    :param allowed_mismatches: maximum hamming distance allowed between
                               two consecutive (sorted) UMIs of a cluster
    :type allowed_mismatches: integer
    :return: a list of unique UMIs (one per cluster, empty for no input)
    :rtype: list
    """
    # A list of clusters in creation order replaces the counter-keyed dict:
    # it iterates the same way on Python 2 and 3 (dict.itervalues() does
    # not exist on Python 3) and keeps the output order deterministic
    clusters = []
    for molecular_barcode in sorted(molecular_barcodes):
        # compare the new UMI with the previous one (last member of the
        # current cluster); within the threshold it joins that cluster,
        # otherwise it opens a new one
        if clusters and hamming_distance(clusters[-1][-1],
                                         molecular_barcode) <= allowed_mismatches:
            clusters[-1].append(molecular_barcode)
        else:
            clusters.append([molecular_barcode])
    # Return one representative per cluster
    return [random.choice(members) for members in clusters]
 def get_adj_list_directional_adjacency(umis, counts):
     """
     Builds the directional adjacency list of the given UMIs.
     A UMI is listed as neighbour of another UMI when their hamming
     distance is at most allowed_mismatches (taken from the surrounding
     scope) and the count of the source UMI is at least twice the
     count of the neighbour minus one.
     :param umis: the UMIs to connect
     :param counts: a mapping from UMI to its count
     :return: a dictionary mapping each UMI to the list of its neighbours
     :rtype: dict
     """
     adjacency = {}
     for source in umis:
         neighbours = []
         for candidate in umis:
             if hamming_distance(source, candidate) <= allowed_mismatches:
                 # the count condition is only checked for close enough UMIs
                 if counts[source] >= (counts[candidate] * 2) - 1:
                     neighbours.append(candidate)
         adjacency[source] = neighbours
     return adjacency
Esempio n. 4
0
def affinity_umi_removal(molecular_barcodes, _):
    """
    Tries to find clusters of similar UMIs using an affinity based approach
    (AffinityPropagation fitted on the negated pairwise hamming distances).
    It returns one UMI per cluster; for clusters of multiple UMIs a
    random one will be selected.
    :param molecular_barcodes: a list of UMIs
    :param _: only used when there are at most two UMIs, where it is
              forwarded to countUMINaive as the number of allowed mismatches
              (NOTE(review): presumably callers pass allowed_mismatches
              here - confirm against the call sites)
    :return: a list of unique UMIs
    :rtype: list
    """
    if len(molecular_barcodes) <= 2:
        # The second parameter is named `_`, so `allowed_mismatches` is not
        # defined in this scope; forward the value that was actually given
        return countUMINaive(molecular_barcodes, _)
    words = np.asarray(molecular_barcodes)
    # Similarity matrix: the negated hamming distance of every pair of UMIs
    similarity = -1 * np.array([[hamming_distance(w1, w2) for w1 in words]
                                for w2 in words])
    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(similarity)
    unique_clusters = []
    for cluster_id in np.unique(affprop.labels_):
        # Distinct UMIs that were assigned to this cluster label
        members = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
        unique_clusters.append(random.choice(members))
    return unique_clusters
 def d(coord):
     """
     Returns the hamming distance between the two UMIs that a pair of
     indexes selects from molecular_barcodes (taken from the surrounding
     scope).
     :param coord: a pair (i, j) of indexes into molecular_barcodes
     :return: the result of hamming_distance for the two selected UMIs
     """
     first, second = coord
     umi_a = molecular_barcodes[first]
     umi_b = molecular_barcodes[second]
     return hamming_distance(umi_a, umi_b)
 def get_adj_list_adjacency(umis):
     """
     Builds the adjacency list of the given UMIs: every UMI is mapped
     to the UMIs (itself included when its self distance qualifies)
     whose hamming distance to it is at most allowed_mismatches
     (taken from the surrounding scope).
     :param umis: the UMIs to connect
     :return: a dictionary mapping each UMI to the list of its neighbours
     :rtype: dict
     """
     adjacency = {}
     for umi in umis:
         adjacency[umi] = [other for other in umis
                           if hamming_distance(umi, other) <= allowed_mismatches]
     return adjacency
 def d(coord):
     """
     Hamming distance between two UMIs of molecular_barcodes (taken
     from the surrounding scope), addressed by their indexes.
     :param coord: a pair (i, j) of indexes into molecular_barcodes
     :return: the result of hamming_distance for the two selected UMIs
     """
     index_a, index_b = coord
     left = molecular_barcodes[index_a]
     right = molecular_barcodes[index_b]
     return hamming_distance(left, right)
 def get_adj_list_directional_adjacency(umis, counts):
     """
     Builds the directional adjacency list of the given UMIs.
     A UMI is a neighbour of a source UMI when their hamming distance is
     at most allowed_mismatches (taken from the surrounding scope) and
     the count of the source is at least twice the neighbour count
     minus one.
     :param umis: the UMIs to connect
     :param counts: a mapping from UMI to its count
     :return: a dictionary mapping each UMI to the list of its neighbours
     :rtype: dict
     """
     def is_directional_neighbour(source, target):
         # The counts are only looked up for UMIs that are close enough
         if hamming_distance(source, target) <= allowed_mismatches:
             return counts[source] >= (counts[target] * 2) - 1
         return False

     return {
         source: [target for target in umis
                  if is_directional_neighbour(source, target)]
         for source in umis
     }
 def get_adj_list_adjacency(umis):
     """
     Maps every UMI to the list of UMIs whose hamming distance to it is
     at most allowed_mismatches (taken from the surrounding scope).
     :param umis: the UMIs to connect
     :return: a dictionary mapping each UMI to the list of its neighbours
     :rtype: dict
     """
     adjacency = {}
     for source in umis:
         neighbours = []
         for target in umis:
             if hamming_distance(source, target) <= allowed_mismatches:
                 neighbours.append(target)
         adjacency[source] = neighbours
     return adjacency
Esempio n. 10
0
 def d(coord):
     """
     Hamming distance between two UMIs of molecular_barcodes (taken
     from the surrounding scope), addressed by their indexes. Both UMIs
     are encoded as UTF-8 before they are handed to hamming_distance.
     :param coord: a pair (i, j) of indexes into molecular_barcodes
     :return: the result of hamming_distance for the two encoded UMIs
     """
     first, second = coord
     encoded_a = molecular_barcodes[first].encode("UTF-8")
     encoded_b = molecular_barcodes[second].encode("UTF-8")
     return hamming_distance(encoded_a, encoded_b)
Esempio n. 11
0
 def get_adj_list_adjacency(umis):
     """
     Maps every UMI to the list of UMIs whose hamming distance to it is
     at most allowed_mismatches (taken from the surrounding scope).
     The UMIs are encoded as UTF-8 before they are handed to
     hamming_distance.
     :param umis: the UMIs to connect
     :return: a dictionary mapping each UMI to the list of its neighbours
     :rtype: dict
     """
     adjacency = {}
     for umi in umis:
         adjacency[umi] = [
             other for other in umis
             if hamming_distance(umi.encode("UTF-8"),
                                 other.encode("UTF-8")) <= allowed_mismatches
         ]
     return adjacency