import json

import affinegap
import affinegap as ag  # alias used by gapped_edit_dist below
import fastcluster
import numpy as np
import pandas as pd
from affinegap import normalizedAffineGapDistance
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform

# Project-local helpers referenced below (dextrapreclustering, processhelper,
# collectSubrecords, collectSubrecordsSimple) are assumed importable from the
# surrounding package.


def gapped_edit_dist(seq1_str, seq2_str):
    # Normalized affine gap distance with custom weights: matches are free,
    # while mismatches, gap openings, and gap extensions are penalized.
    d1 = ag.normalizedAffineGapDistance(seq1_str, seq2_str,
                                        matchWeight=0,
                                        mismatchWeight=1,
                                        gapWeight=1,
                                        spaceWeight=5,
                                        abbreviation_scale=1)
    return d1
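# A minimal usage sketch for gapped_edit_dist; the sample strings are made up
# for illustration, and exact scores depend on the installed affinegap version.
def _demo_gapped_edit_dist():
    close = gapped_edit_dist('Jones, Mary', 'Jones, M')
    far = gapped_edit_dist('Jones, Mary', 'Smith, John')
    # A near-duplicate pair is expected to score lower than an unrelated pair.
    print('near-duplicate:', close, 'unrelated:', far)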
def affinegaphClustering(data):
    # Build the full pairwise distance matrix.
    print(data)
    dis = []
    for n in range(len(data)):
        row = []
        for i in range(len(data)):
            d3 = affinegap.normalizedAffineGapDistance(data[n], data[i])
            row.append(d3)
        dis.append(row)
    print(dis)
    # linkage() treats a 2-D input as observation vectors, so convert the
    # square matrix to condensed form first (checks=False because the
    # self-distance of normalizedAffineGapDistance is not exactly zero).
    linkage = fastcluster.linkage(squareform(np.asarray(dis), checks=False),
                                  method="complete")
    b = fcluster(linkage, t=0.99, criterion='inconsistent')
    print(b)
    return b
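# A minimal usage sketch for affinegaphClustering on a toy list of near-
# duplicate strings; the names are invented for illustration.
def _demo_affinegaphClustering():
    names = ['Acme Corp', 'ACME Corporation', 'Widget Ltd', 'Widgets Limited']
    labels = affinegaphClustering(names)
    # fcluster returns one flat-cluster label per input string.
    assert len(labels) == len(names)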
def pairwise_affine_gap_distance(x1, x2):
    """Calculate normalized affine gap distance between two samples x1 and x2.

    Parameters
    ----------
    x1 : array [n_features]
        Sample 1.
    x2 : array [n_features]
        Sample 2.

    Returns
    -------
    agap_distance : float
        Normalized affine gap distance of the given pair, summed over features.
    """
    agap_distance = 0
    assert len(x1) == len(x2)
    for i in range(len(x1)):
        # Treat missing/falsy values as empty strings.
        str1 = str(x1[i]) if x1[i] else ''
        str2 = str(x2[i]) if x2[i] else ''
        if not str1 and not str2:
            # normalizedAffineGapDistance is undefined for two empty strings,
            # so charge a fixed penalty of 0.5 instead.
            agap_distance += 0.5
        else:
            agap_distance += normalizedAffineGapDistance(str1, str2)
    return agap_distance
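# A minimal usage sketch for pairwise_affine_gap_distance: two records with
# the same fields, where the third field is missing in both rows and therefore
# draws the fixed 0.5 penalty. The field values are invented for illustration.
def _demo_pairwise_affine_gap_distance():
    row_a = ['Mary Jones', '12 High St', None]
    row_b = ['M. Jones', '12 High Street', None]
    d = pairwise_affine_gap_distance(row_a, row_b)
    # Two non-empty field comparisons plus the 0.5 both-empty penalty.
    assert d >= 0.5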
def affinegapFastClustering(clusternum, data):
    # Build the full pairwise distance matrix.
    dis = []
    for n in range(len(data)):
        row = []
        for i in range(len(data)):
            d3 = affinegap.normalizedAffineGapDistance(data[n], data[i])
            row.append(d3)
        dis.append(row)
    # Convert the square matrix to condensed form for linkage() (checks=False
    # because the self-distance of normalizedAffineGapDistance is not zero).
    linkage = fastcluster.linkage(squareform(np.asarray(dis), checks=False),
                                  method="complete")
    # clusternum = 2
    # Replay the merge steps bottom-up until exactly clusternum clusters remain.
    clustdict = {i: [i] for i in range(len(linkage) + 1)}
    for i in range(len(linkage) - clusternum + 1):
        clust1 = int(linkage[i][0])
        clust2 = int(linkage[i][1])
        clustdict[max(clustdict) + 1] = clustdict[clust1] + clustdict[clust2]
        del clustdict[clust1], clustdict[clust2]
    print(clustdict)
    return clustdict
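# A minimal usage sketch for affinegapFastClustering: cut the dendrogram into
# exactly two clusters. Keys of the returned dict are internal cluster ids;
# values are lists of row indices into `data`. The names are invented.
def _demo_affinegapFastClustering():
    names = ['Acme Corp', 'ACME Corporation', 'Widget Ltd', 'Widgets Limited']
    clustdict = affinegapFastClustering(2, names)
    assert len(clustdict) == 2
    # Every input row appears in exactly one cluster.
    members = sorted(i for v in clustdict.values() for i in v)
    assert members == list(range(len(names)))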
def dedupe_affine_gap(s1, s2):
    # Element-wise normalized affine gap distance over two aligned sequences.
    return pd.Series(list(zip(s1, s2))).apply(
        lambda x: normalizedAffineGapDistance(x[0], x[1]))
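# A minimal usage sketch for dedupe_affine_gap over two aligned pandas Series
# of candidate duplicate strings; the values are invented for illustration.
def _demo_dedupe_affine_gap():
    left = pd.Series(['Acme Corp', 'Widget Ltd'])
    right = pd.Series(['ACME Corporation', 'Widgets Limited'])
    dists = dedupe_affine_gap(left, right)
    assert len(dists) == 2       # one distance per aligned pair
    assert (dists >= 0).all()    # normalized distances are non-negative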
def hbfClusterViewComplexMinhashaffinegap(corahbf, dataset, acr_threshold,
                                          username, dis_threshold):
    cluster_dict = {}
    ordered_layers_dict = corahbf.get_orderlayers_dict()
    print(json.dumps(ordered_layers_dict))
    lendict = len(ordered_layers_dict)
    total_records = corahbf.get_dataset_records_ids()
    for k in range(lendict, 0, -1):
        layer = corahbf.get_single_layer(ordered_layers_dict[k])
        print(layer.get_layerName(), " at layer:", layer.get_level())
        # Check the layer ACR (attribute coverage ratio).
        nullNodeRecords = []
        all_children_dict = layer.get_all_children()
        if 'NULL' in all_children_dict.keys():
            nullNode = layer.get_single_child('NULL')
            nullNodeRecords = nullNode.get_index_recordids()
            print('nullNodeRecords:', nullNodeRecords)
            layer_ACR = 1 - len(nullNodeRecords) / len(total_records)
        else:
            layer_ACR = 1
        # Skip layers whose coverage falls below the threshold.
        if layer_ACR < acr_threshold:
            continue
        count = 0
        ordered_nodes_dict = layer.get_ordered_nodes_dict()
        for order, value in ordered_nodes_dict.items():
            if value == 'NULL':
                continue
            node = layer.get_single_child(value)
            # print('start splitting')
            nodeRecords = node.get_index_recordids()
            if len(nodeRecords) > 20:
                print('start collect subrecords')
                subrecords = collectSubrecordsSimple(node=node,
                                                     dataset=dataset,
                                                     username=username)
                print('start hclustering')
                clustdict = dextrapreclustering.affinegapSimpleClustering(
                    data=subrecords)
                # Invert clustdict into a record-index -> cluster-id map.
                cluster_membership = {}
                for cid, members in clustdict.items():
                    for d in members:
                        cluster_membership[d] = cid
                subgroups = [cluster_membership[i]
                             for i in range(len(nodeRecords))]
            else:
                subgroups = [1 for _ in nodeRecords]
            for i in range(len(subgroups)):
                cluster_dict[nodeRecords[i]] = subgroups[i] + count
            count = count + len(subgroups)
        # Merge the NULL-node records into the closest existing cluster.
        print('start merging-------')
        if nullNodeRecords:
            notnullrecords = list(cluster_dict.keys())
            notnulls = dataset.filter(id__in=notnullrecords)
            data1 = [item.cleantext for item in notnulls]
            notnullminhashs = processhelper.getMinHashs(data=data1)
            forest = processhelper.getMinhashforest2(minhashs=notnullminhashs)
            nulls = dataset.filter(id__in=nullNodeRecords)
            data2 = [item.cleantext for item in nulls]
            nullsminhashs = processhelper.getMinHashs(data=data2)
            for j in range(len(nullNodeRecords)):
                # Using the null record's MinHash as the query, retrieve the
                # top 5 keys with the highest Jaccard similarity, then re-rank
                # those candidates by affine gap distance.
                result = forest.query(nullsminhashs[j], 5)
                tempp = [1000000, 0]
                count = count + 1
                for notnullkey in result:
                    # jd = nullsminhashs[j].jaccard(notnullminhashs[notnullkey])
                    d3 = affinegap.normalizedAffineGapDistance(
                        data2[j], data1[notnullkey])
                    if d3 < tempp[0]:
                        tempp[0] = d3
                        tempp[1] = notnullkey
                if tempp[0] < 3:
                    # Close enough: join the nearest record's cluster.
                    reid = notnullrecords[tempp[1]]
                    cluster_dict[nullNodeRecords[j]] = cluster_dict[reid]
                else:
                    # Otherwise start a new singleton cluster.
                    cluster_dict[nullNodeRecords[j]] = count
                    count = count + 1
        print('cluster size:', len(cluster_dict))
        return cluster_dict
def hbfClusterView(corahbf, dataset, acr_threshold, username, dis_threshold):
    cluster_dict = {}
    ordered_layers_dict = corahbf.get_orderlayers_dict()
    print(json.dumps(ordered_layers_dict))
    lendict = len(ordered_layers_dict)
    total_records = corahbf.get_dataset_records_ids()
    for k in range(lendict, 0, -1):
        layer = corahbf.get_single_layer(ordered_layers_dict[k])
        print(layer.get_layerName(), " at layer:", layer.get_level())
        # Check the layer ACR (attribute coverage ratio).
        nullNodeRecords = []
        all_children_dict = layer.get_all_children()
        if 'NULL' in all_children_dict.keys():
            nullNode = layer.get_single_child('NULL')
            nullNodeRecords = nullNode.get_index_recordids()
            print('nullNodeRecords:', nullNodeRecords)
            layer_ACR = 1 - len(nullNodeRecords) / len(total_records)
        else:
            layer_ACR = 1
        # Skip layers whose coverage falls below the threshold.
        if layer_ACR < acr_threshold:
            continue
        count = 0
        ordered_nodes_dict = layer.get_ordered_nodes_dict()
        for order, value in ordered_nodes_dict.items():
            if value == 'NULL':
                continue
            node = layer.get_single_child(value)
            # Split each node's records into subclusters.
            nodeRecords = node.get_index_recordids()
            if len(nodeRecords) > 1:
                subrecords = collectSubrecords(node=node, dataset=dataset,
                                               username=username)
                subgroups = dextrapreclustering.affinegaphClustering(
                    data=subrecords)
            else:
                subgroups = [1]
            for i in range(len(subgroups)):
                cluster_dict[nodeRecords[i]] = subgroups[i] + count
            count = count + len(subgroups)
        # Merge the NULL-node records into the nearest cluster.
        if nullNodeRecords:
            removej = []
            notnullrecords = list(cluster_dict.keys())
            for j in range(len(nullNodeRecords)):
                tempmin = [100000000, 0]
                # for i in range(len(cluster_dict)):
                #     data = dataset.filter(id__in=[nodeRecords[i], nullNodeRecords[j]])
                for i in notnullrecords:
                    data = dataset.filter(id__in=[i, nullNodeRecords[j]])
                    d3 = affinegap.normalizedAffineGapDistance(
                        data[0].cleantext, data[1].cleantext)
                    if d3 < tempmin[0]:
                        tempmin[0] = d3
                        tempmin[1] = i
                if tempmin[0] < dis_threshold:
                    # Join the cluster of the closest record found above.
                    cluster_dict[nullNodeRecords[j]] = cluster_dict[tempmin[1]]
                    removej.append(nullNodeRecords[j])
            # Give each still-unmatched NULL record its own new cluster id.
            remaining = list(set(nullNodeRecords).difference(set(removej)))
            count = count + 1
            for rid in remaining:
                cluster_dict[rid] = count
                count = count + 1
        return cluster_dict