Example 1
import affinegap as ag

def gapped_edit_dist(seq1_str, seq2_str):
    # Normalized affine gap distance with custom weights: matches are free,
    # while mismatches, gaps, and spaces all carry a penalty.
    d1 = ag.normalizedAffineGapDistance(seq1_str,
                                        seq2_str,
                                        matchWeight=0,
                                        mismatchWeight=1,
                                        gapWeight=1,
                                        spaceWeight=5,
                                        abbreviation_scale=1)
    return d1
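A minimal usage sketch for the helper above, assuming the affinegap package is installed; the input strings are illustrative:

if __name__ == '__main__':
    # Lower scores mean more similar strings with these weights.
    print(gapped_edit_dist('Jon Smith', 'John Smith'))
    print(gapped_edit_dist('Jon Smith', 'Jane Doe'))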
Example 2
import affinegap
import fastcluster
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform

def affinegaphClustering(data):
    # Build the full pairwise distance matrix.
    print(data)
    dis = []
    for n in range(len(data)):
        row = []
        for i in range(len(data)):
            d3 = affinegap.normalizedAffineGapDistance(data[n], data[i])
            row.append(d3)
        dis.append(row)
    print(dis)
    # fastcluster.linkage expects a condensed distance matrix, so convert the
    # square matrix (checks=False because the diagonal of a normalized affine
    # gap matrix is not zero).
    linkage = fastcluster.linkage(squareform(dis, checks=False),
                                  method="complete")
    b = fcluster(linkage, t=0.99, criterion='inconsistent')
    print(b)
    return b
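A quick usage sketch on a small, illustrative list of strings:

names = ['John Smith', 'Jon Smith', 'Jane Doe', 'J. Doe']
labels = affinegaphClustering(names)  # one flat-cluster label per input string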
Example 3
from affinegap import normalizedAffineGapDistance

def pairwise_affine_gap_distance(x1, x2):
    """Calculate normalized affine gap distance between two samples x1 and x2.

    Parameters
    ----------
    x1 : sample 1, array [n_features]
    x2 : sample 2, array [n_features]

    Returns
    -------
    agap_distance : float, normalized affine gap distance of a given pair
    """
    agap_distance = 0
    assert len(x1) == len(x2)
    for i in range(len(x1)):
        # Treat missing/falsy feature values as empty strings.
        str1 = str(x1[i]) if x1[i] else ''
        str2 = str(x2[i]) if x2[i] else ''
        if not str1 and not str2:
            # Both fields missing: add a fixed penalty instead of comparing.
            agap_distance += 0.5
        else:
            agap_distance += normalizedAffineGapDistance(str1, str2)
    return agap_distance
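A brief usage sketch with two illustrative records, each a list of field values:

record_a = ['Acme Corp', '123 Main St', None]
record_b = ['ACME Corporation', '123 Main Street', None]
print(pairwise_affine_gap_distance(record_a, record_b))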
Example 4
import affinegap
import fastcluster
from scipy.spatial.distance import squareform

def affinegapFastClustering(clusternum, data):
    # Build the full pairwise distance matrix.
    dis = []
    for n in range(len(data)):
        row = []
        for i in range(len(data)):
            d3 = affinegap.normalizedAffineGapDistance(data[n], data[i])
            row.append(d3)
        dis.append(row)
    # Condense the square matrix for fastcluster (checks=False because the
    # diagonal of a normalized affine gap matrix is not zero).
    linkage = fastcluster.linkage(squareform(dis, checks=False),
                                  method="complete")
    # Replay the dendrogram's merge steps, stopping early so that exactly
    # clusternum clusters remain.
    clustdict = {i: [i] for i in range(len(linkage) + 1)}
    for i in range(len(linkage) - clusternum + 1):
        clust1 = int(linkage[i][0])
        clust2 = int(linkage[i][1])
        clustdict[max(clustdict) + 1] = clustdict[clust1] + clustdict[clust2]
        del clustdict[clust1], clustdict[clust2]

    print(clustdict)
    return clustdict
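A usage sketch (illustrative strings), cutting the dendrogram down to two clusters:

groups = affinegapFastClustering(2, ['alpha', 'alpah', 'beta', 'betta'])
# e.g. {4: [0, 1], 5: [2, 3]} -- cluster id mapped to indices of the inputs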
Example 5
import pandas as pd
from affinegap import normalizedAffineGapDistance

def dedupe_affine_gap(s1, s2):
    return pd.Series(list(zip(s1, s2))).apply(lambda x: normalizedAffineGapDistance(x[0], x[1]))
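Usage sketch with two aligned lists of strings (values illustrative); the result is a pandas Series of element-wise distances:

left = ['foo ltd', 'bar inc']
right = ['foo limited', 'bar incorporated']
print(dedupe_affine_gap(left, right))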
Example 6
import json

import affinegap

# processhelper, dextrapreclustering and collectSubrecordsSimple are
# project-local helpers assumed importable in the original codebase.

def hbfClusterViewComplexMinhashaffinegap(corahbf, dataset, acr_threshold,
                                          username, dis_threshold):
    cluster_dict = {}
    ordered_layers_dict = corahbf.get_orderlayers_dict()
    print(json.dumps(ordered_layers_dict))
    lendict = len(ordered_layers_dict)
    total_records = corahbf.get_dataset_records_ids()
    for k in range(lendict, 0, -1):
        layer = corahbf.get_single_layer(ordered_layers_dict[k])
        print(layer.get_layerName(), " at layer:", layer.get_level())
        # check the layer-ACR
        all_children_dict = layer.get_all_children()
        if 'NULL' in all_children_dict.keys():
            nullNode = layer.get_single_child('NULL')
            nullNodeRecords = nullNode.get_index_recordids()

            print('nullNodeRecords:', nullNodeRecords)
            layer_ACR = 1 - len(nullNodeRecords) / len(total_records)
        else:
            nullNodeRecords = []
            layer_ACR = 1
        # skip
        if layer_ACR < acr_threshold:
            continue
        else:
            count = 0
            ordered_nodes_dict = layer.get_ordered_nodes_dict()
            for order, value in ordered_nodes_dict.items():
                if value == 'NULL':
                    continue
                node = layer.get_single_child(value)
                # split this node into sub-clusters
                print('start splitting')
                nodeRecords = node.get_index_recordids()
                if len(nodeRecords) > 20:
                    print('start collect subrecords')
                    subrecords = collectSubrecordsSimple(node=node,
                                                         dataset=dataset,
                                                         username=username)
                    print('start hclustering')
                    clustdict = dextrapreclustering.affinegapSimpleClustering(
                        data=subrecords)
                    cluster_membership = {}
                    for ck, cv in clustdict.items():  # fresh names, so the layer index k is not shadowed
                        for d in cv:
                            cluster_membership[d] = ck
                    subgroups = [
                        cluster_membership[i] for i in range(len(nodeRecords))
                    ]
                else:
                    subgroups = [1 for k in nodeRecords]
                for i in range(len(subgroups)):
                    cluster_dict[nodeRecords[i]] = subgroups[i] + count
                count = count + len(subgroups)
            # merge
            print('start merging-------')
            if nullNodeRecords:

                notnullrecords = [k for k in cluster_dict.keys()]
                notnulls = dataset.filter(id__in=notnullrecords)
                data1 = [item.cleantext for item in notnulls]
                notnullminhashs = processhelper.getMinHashs(data=data1)
                forest = processhelper.getMinhashforest2(
                    minhashs=notnullminhashs)
                nulls = dataset.filter(id__in=nullNodeRecords)
                data2 = [item.cleantext for item in nulls]
                nullsminhashs = processhelper.getMinHashs(data=data2)
                for j in range(len(nullNodeRecords)):
                    # Query the forest for the top 5 keys with the highest
                    # Jaccard similarity to this null record's MinHash.
                    result = forest.query(nullsminhashs[j], 5)
                    tempp = [1000000, 0]  # [best distance so far, index of closest record]
                    count = count + 1
                    for notnullkey in result:
                        # jd = nullsminhashs[j].jaccard(notnullminhashs[notnullkey])
                        d3 = affinegap.normalizedAffineGapDistance(
                            data2[j], data1[notnullkey])
                        if d3 < tempp[0]:
                            tempp[0] = d3
                            tempp[1] = notnullkey

                    # Attach to the closest non-null record when within the threshold.
                    if tempp[0] < dis_threshold:
                        reid = notnullrecords[tempp[1]]
                        cluster_dict[nullNodeRecords[j]] = cluster_dict[reid]
                    else:
                        cluster_dict[nullNodeRecords[j]] = count
                        count = count + 1
                print('cluster size:', len(set(cluster_dict.values())))
            return cluster_dict
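The merge step above goes through project-local MinHash helpers (processhelper.getMinHashs, processhelper.getMinhashforest2). For reference, here is a self-contained sketch of the same query pattern using the datasketch library, on the assumption that it is what those helpers wrap:

from datasketch import MinHash, MinHashLSHForest

def build_minhash_forest(strings, num_perm=128):
    # One MinHash per string, keyed by its index in the input list.
    minhashes = []
    forest = MinHashLSHForest(num_perm=num_perm)
    for idx, s in enumerate(strings):
        m = MinHash(num_perm=num_perm)
        for token in s.split():
            m.update(token.encode('utf8'))
        minhashes.append(m)
        forest.add(idx, m)
    forest.index()  # the forest must be indexed before it can be queried
    return forest, minhashes

forest, hashes = build_minhash_forest(['john smith', 'jon smith', 'jane doe'])
print(forest.query(hashes[0], 2))  # indices of the top-2 Jaccard candidates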
Example 7
import json

import affinegap

# collectSubrecords and dextrapreclustering are project-local helpers
# assumed importable in the original codebase.

def hbfClusterView(corahbf, dataset, acr_threshold, username, dis_threshold):
    cluster_dict = {}
    ordered_layers_dict = corahbf.get_orderlayers_dict()
    print(json.dumps(ordered_layers_dict))
    lendict = len(ordered_layers_dict)
    total_records = corahbf.get_dataset_records_ids()
    for k in range(lendict, 0, -1):
        layer = corahbf.get_single_layer(ordered_layers_dict[k])
        print(layer.get_layerName(), " at layer:", layer.get_level())
        # check the layer-ACR
        all_children_dict = layer.get_all_children()
        if 'NULL' in all_children_dict.keys():
            nullNode = layer.get_single_child('NULL')
            nullNodeRecords = nullNode.get_index_recordids()

            print('nullNodeRecords:', nullNodeRecords)
            layer_ACR = 1 - len(nullNodeRecords) / len(total_records)
        else:
            nullNodeRecords = []
            layer_ACR = 1
        # skip
        if layer_ACR < acr_threshold:
            continue
        else:
            count = 0
            ordered_nodes_dict = layer.get_ordered_nodes_dict()
            for order, value in ordered_nodes_dict.items():
                if value == 'NULL':
                    continue
                node = layer.get_single_child(value)
                # split
                nodeRecords = node.get_index_recordids()
                if len(nodeRecords) > 1:
                    subrecords = collectSubrecords(node=node,
                                                   dataset=dataset,
                                                   username=username)
                    subgroups = dextrapreclustering.affinegaphClustering(
                        data=subrecords)
                else:
                    subgroups = [1]
                for i in range(len(subgroups)):
                    cluster_dict[nodeRecords[i]] = subgroups[i] + count
                count = count + len(subgroups)

            # merge
            if nullNodeRecords:
                removej = []
                notnullrecords = [k for k in cluster_dict.keys()]
                for j in range(len(nullNodeRecords)):
                    tempmin = [100000000, 0]
                    # for i in range(len(cluster_dict)):
                    #     data = dataset.filter(id__in=[nodeRecords[i], nullNodeRecords[j]])
                    for i in notnullrecords:
                        data = dataset.filter(id__in=[i, nullNodeRecords[j]])
                        d3 = affinegap.normalizedAffineGapDistance(
                            data[0].cleantext, data[1].cleantext)
                        if d3 < tempmin[0]:
                            tempmin[0] = d3
                            tempmin[1] = i
                    if tempmin[0] < dis_threshold:
                        # tempmin[1] holds the id of the closest non-null record.
                        cluster_dict[nullNodeRecords[j]] = cluster_dict[tempmin[1]]
                        removej.append(nullNodeRecords[j])
                remaining = list(set(nullNodeRecords).difference(set(removej)))
                # Unmatched null records each get a fresh cluster id.
                count = count + 1
                for id in remaining:
                    cluster_dict[id] = count
                    count = count + 1
    return cluster_dict