def string_disambiguation(value, candidates, name=False):
    def get_label(x):
        # extract the human-readable label from a resource URI
        return x.split('resource')[1][1:].replace('_', ' ')

    if name:
        # rank by a (name similarity, negated Levenshtein distance) tuple
        best_dist, best_match = (-1, -float('inf')), None
        for entity in candidates:
            label = get_label(entity)
            name_sim = who.ratio(label, value)
            neg_lev_dist = -levenshtein(value.lower(), label.lower())
            dist = (name_sim, neg_lev_dist)
            if dist[1] <= 0 and dist > best_dist:
                best_dist = dist
                best_match = entity
    else:
        best_dist, best_match = 999999, None
        for entity in candidates:
            label = get_label(entity)
            # pass the current best distance as a cut-off so the search can exit early
            dist = levenshtein(value.lower(), label.lower(), best_dist)
            if 0 <= dist < best_dist:
                best_dist = dist
                best_match = entity
    return best_match
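# Illustrative usage of string_disambiguation, exercising only the default
# (name=False) branch since `who` is defined elsewhere in the source module.
# The candidate URIs below are made up for the example.
from polyleven import levenshtein

candidates = [
    'http://dbpedia.org/resource/New_York',
    'http://dbpedia.org/resource/New_Jersey',
]
# get_label turns these into 'New York' / 'New Jersey'; the lowercased
# distance to 'new york' is 0 vs. 5, so the first resource wins.
print(string_disambiguation('new york', candidates))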
def get_variations(text: str) -> set:
    """Returns all synonyms of a text having an edit-distance of 2 or less."""
    text = text.replace(' ', '_')
    return {
        s.replace('_', ' ')
        for s in words_util.get_synonyms(text)
        if levenshtein(s, text, 2) <= 2
    }
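# Rough illustration of get_variations; the real words_util module is
# external, so a hypothetical stand-in supplies a fixed synonym set here.
from polyleven import levenshtein

class words_util:  # hypothetical stand-in for the real synonym source
    @staticmethod
    def get_synonyms(text):
        return {'Organisation', 'Organization', 'Institution'}

print(get_variations('Organisation'))
# {'Organisation', 'Organization'} -- 'Institution' exceeds the cut-off of 2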
def levenshtein_similarity(string_a, string_b):
    '''Returns the Levenshtein similarity between two strings.

    The Levenshtein distance is defined as the smallest number of edit
    operations (insertion, deletion, and substitution) required to transform
    one string into another, e.g.:

        string_a = 'the fast brown fox'
        string_b = 'the slow brown fox'
        Levenshtein distance = 4

    The distance is converted into a similarity value with the formula
    1 - distance / max(len(string_a), len(string_b)), so here
    levenshtein_similarity = 1 - 4/18 = 0.778.

    Parameters:
        string_a : first string (text)
        string_b : second string (text)

    Returns:
        float: levenshtein_similarity * 100
    '''
    # Calculate the Levenshtein distance using polyleven (Myers algorithm)
    levenshtein_distance = levenshtein(string_a, string_b)

    # Convert the distance into a similarity percentage
    levenshtein_sim = (
        1 - levenshtein_distance / max(len(string_a), len(string_b))) * 100
    return levenshtein_sim
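# Sanity check of the formula above with the docstring's own example
# (assumes `levenshtein` is imported from polyleven, as the comment states).
from polyleven import levenshtein

print(levenshtein_similarity('the fast brown fox', 'the slow brown fox'))
# 77.77... (distance 4 over a maximum length of 18)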
def _add_list_to_graph(self, lst: str, lst_name: str, list_graph: ListGraph) -> str:
    """Add a list as new node to the graph."""
    node_id = self._convert_to_clg_type(lst_name)
    if not self.has_node(node_id):
        # If node_id is not in the graph, then we try synonyms with max. edit-distance of 2
        # e.g. to cover cases where the type is named 'Organisation' and the category 'Organization'
        for name_variation in hypernymy_util.get_variations(lst_name):
            node_id_alternative = clg_util.name2clg_type(name_variation)
            if self.has_node(node_id_alternative):
                node_id = node_id_alternative
                break
    node_parts = list_graph.get_parts(lst)

    # check for equivalent mapping and existing node_id (if they map to more than one node -> log error)
    equivalent_nodes = {
        node
        for eq_cat in list_mapping.get_equivalent_categories(lst)
        for node in self.get_nodes_for_part(eq_cat)
    }
    if self.has_node(node_id):
        equivalent_nodes.add(node_id)
    if len(equivalent_nodes) > 1:
        utils.get_logger().debug(
            f'CaLiGraph: ListMerge - For "{lst}" multiple equivalent nodes have been found: {equivalent_nodes}.'
        )
        equivalent_nodes = {node_id} if node_id in equivalent_nodes else equivalent_nodes
    if equivalent_nodes:
        main_node_id = sorted(equivalent_nodes, key=lambda x: levenshtein(x, node_id))[0]
        self._set_parts(main_node_id, self.get_parts(main_node_id) | node_parts)
        return main_node_id

    # check for parents to initialise under (parent mapping)
    self._add_nodes({node_id})
    self._set_name(node_id, lst_name)
    self._set_parts(node_id, node_parts)
    parent_nodes = {
        node
        for parent_cat in list_mapping.get_parent_categories(lst)
        for node in self.get_nodes_for_part(parent_cat)
    }
    self._add_edges({(pn, node_id) for pn in parent_nodes})
    return node_id
def remove_below_edit_dist(barcode_list, n=3, randomize=True):
    """Remove barcodes within a Levenshtein edit-distance of n of an already kept barcode."""
    min_dist = range(n)  # distances 0..n-1 count as "too close"
    if randomize:
        random.shuffle(barcode_list)  # shuffle barcodes (to avoid always starting with the same barcode)
    filtered = {barcode_list[0]}  # set first barcode as 'seed'
    barcode_list = set(barcode_list)  # convert barcodes to set (for performance reasons)
    for i in barcode_list:
        broken = 0
        for j in filtered:
            current_dist = levenshtein(i, j, n - 1)  # calculate distance to every barcode added so far
            if current_dist in min_dist:  # only keep 'acceptable' barcodes
                broken = 1  # stop looking if a 'match' is found
                break
        if broken == 0:
            filtered.add(i)
    return list(filtered)
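# Illustrative call with made-up barcodes. Note that levenshtein(i, j, n - 1)
# is polyleven's bounded form: it returns n once the distance exceeds n - 1,
# which is exactly the value that range(n) excludes.
import random
from polyleven import levenshtein

barcodes = ['AAAA', 'AAAT', 'CCGG', 'TTTT', 'TTTA']
# With n=3, any barcode within edit-distance 2 of an already kept one is
# dropped, so three survive: 'AAAA', 'CCGG', and one of 'TTTT'/'TTTA'.
print(sorted(remove_below_edit_dist(barcodes, n=3, randomize=False)))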
def query_index(seq, barcode_index, num_mismatches=1):
    """Performs a fuzzy barcode search, exhaustive.

    Parameters
    ----------
    seq : str
        A DNA string.
    barcode_index : dict
        A FastSS index of barcodes.
    num_mismatches : int, optional
        Maximum levenshtein distance allowed.

    Returns
    -------
    dict
        A dictionary of fuzzy searching result. Keys are levenshtein
        distance. Values are list of matched barcodes.
    """
    res = {d: [] for d in range(num_mismatches + 1)}
    cands = {barcode_index.get(key) for key in indexkeys(seq, num_mismatches)}
    if seq in cands:
        res[0].append(seq)
    else:
        for cand in cands:
            if cand:  # skip keys that were absent from the index (None)
                dist = levenshtein(seq, cand, num_mismatches)
                if dist <= num_mismatches:
                    res[dist].append(cand)
    return res
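# Minimal sketch of the FastSS-style index assumed above: every deletion
# variant of a barcode maps to the barcode itself. The local indexkeys and
# build_index below are illustrative stand-ins for the module's real helpers.
from itertools import combinations
from polyleven import levenshtein

def indexkeys(word, max_dist):
    """All variants of word with up to max_dist characters deleted."""
    return {
        ''.join(c for i, c in enumerate(word) if i not in positions)
        for d in range(max_dist + 1)
        for positions in combinations(range(len(word)), d)
    }

def build_index(barcodes, max_dist=1):
    # assumes deletion variants are unambiguous across barcodes
    return {key: bc for bc in barcodes for key in indexkeys(bc, max_dist)}

index = build_index(['ACGT', 'TTAA'])
print(query_index('ACGA', index))  # {0: [], 1: ['ACGT']}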
def generate_network(self: Union[Dandelion, pd.DataFrame, str],
                     key: Union[None, str] = None,
                     clone_key: Union[None, str] = None,
                     min_size: int = 2,
                     downsample: Union[None, int] = None,
                     verbose: bool = True,
                     locus: Union[None, Literal['ig', 'tr-ab', 'tr-gd']] = None,
                     **kwargs) -> Dandelion:
    """
    Generates a Levenshtein distance network based on full length VDJ sequence alignments for heavy and light chain(s).

    The distance matrices are then combined into a singular matrix.

    Parameters
    ----------
    data : Dandelion, DataFrame, str
        `Dandelion` object, pandas `DataFrame` in changeo/airr format, or file path to
        changeo/airr file after clones have been determined.
    key : str, optional
        column name for distance calculations. None defaults to 'sequence_alignment_aa'.
    clone_key : str, optional
        column name to build network on.
    min_size : int
        For visualization purposes, two graphs are created: one containing all cells and a
        trimmed second graph. This value specifies the minimum number of edges required;
        otherwise the node will be trimmed in the secondary graph.
    downsample : int, optional
        whether or not to downsample the number of cells prior to construction of network.
        If provided, cells will be randomly sampled to the integer provided. A new
        Dandelion class will be returned.
    verbose : bool
        whether or not to print the progress bars.
    locus : str, optional
        Mode of data. Accepts one of 'ig', 'tr-ab' or 'tr-gd'. None defaults to 'ig'.
    **kwargs
        additional kwargs passed to options specified in `networkx.drawing.layout.spring_layout`.

    Returns
    -------
    `Dandelion` object with `.distance`, `.edges`, `.layout`, `.graph` initialized.
    """
    if verbose:
        start = logg.info('Generating network')
    if self.__class__ == Dandelion:
        dat = load_data(self.data)
    else:
        dat = load_data(self)
    if key is None:
        key_ = 'sequence_alignment_aa'  # default
    else:
        key_ = key
    if key_ not in dat:
        raise ValueError("key {} not found in input table.".format(key_))
    if clone_key is None:
        clonekey = 'clone_id'
    else:
        clonekey = clone_key
    if clonekey not in dat:
        raise ValueError(
            'Data does not contain clone information. Please run find_clones.')
    if locus is None:
        locus = 'ig'

    dat = sanitize_data(dat, ignore=clonekey)

    # calculate distance
    if downsample is not None:
        if downsample >= self.metadata.shape[0]:
            if verbose:
                print('Cannot downsample to {} cells. Using all {} cells.'.format(
                    str(downsample), self.metadata.shape[0]))
            dat_ = dat.copy()  # fall back to all cells so that dat_ is always defined
        else:
            if verbose:
                print('Downsampling to {} cells.'.format(str(downsample)))
            dat_h = dat[dat['locus'].isin(['IGH', 'TRB', 'TRD'])].copy()
            dat_l = dat[dat['locus'].isin(['IGK', 'IGL', 'TRA', 'TRG'])].copy()
            dat_h = dat_h.sample(downsample)
            dat_l = dat_l[dat_l['cell_id'].isin(list(dat_h['cell_id']))].copy()
            dat_ = dat_h.append(dat_l)
            dat_ = sanitize_data(dat_, ignore=clonekey)
    else:
        dat_ = dat.copy()

    # So first, create a data frame to hold all possible (full) sequences split by
    # heavy (only 1 possible for now) and light (multiple possible)
    try:
        dat_seq = retrieve_metadata(dat_,
                                    query=key_,
                                    split=True,
                                    collapse=False,
                                    locus=locus,
                                    ignore=clonekey)
    except:
        dat_seq = retrieve_metadata(dat_,
                                    query=key_,
                                    split=True,
                                    collapse=False,
                                    locus=locus,
                                    ignore=clonekey,
                                    **kwargs)
    dat_seq.columns = [re.sub(key_ + '_', '', i) for i in dat_seq.columns]

    # calculate a distance matrix for all vs all and this can be referenced later on to
    # extract the distance between the right pairs
    dmat = Tree()
    sleep(0.5)
    if verbose:
        for x in tqdm(dat_seq.columns, desc='Calculating distances... '):
            tdarray = np.array(np.array(dat_seq[x])).reshape(-1, 1)
            # NaN != NaN, so cells with a missing sequence contribute distance 0
            d_mat = squareform(
                pdist(
                    tdarray, lambda x, y: levenshtein(x[0], y[0])
                    if (x[0] == x[0]) and (y[0] == y[0]) else 0))
            dmat[x] = d_mat
    else:
        for x in dat_seq.columns:
            tdarray = np.array(np.array(dat_seq[x])).reshape(-1, 1)
            d_mat = squareform(
                pdist(
                    tdarray, lambda x, y: levenshtein(x[0], y[0])
                    if (x[0] == x[0]) and (y[0] == y[0]) else 0))
            dmat[x] = d_mat

    dist_mat_list = [dmat[x] for x in dmat if type(dmat[x]) is np.ndarray]
    total_dist = np.sum(dist_mat_list, axis=0)

    # generate edge list
    if self.__class__ == Dandelion:
        out = self.copy()
        if downsample is not None:
            out = Dandelion(dat_, locus=locus)
    else:
        # re-initiate a Dandelion class object
        out = Dandelion(dat_, locus=locus)

    tmp_totaldist = pd.DataFrame(total_dist,
                                 index=dat_seq.index,
                                 columns=dat_seq.index)
    tmp_clusterdist = Tree()
    overlap = []
    for i in out.metadata.index:
        if len(out.metadata.loc[i, str(clonekey)].split('|')) > 1:
            overlap.append(
                [c for c in out.metadata.loc[i, str(clonekey)].split('|')])
            for c in out.metadata.loc[i, str(clonekey)].split('|'):
                tmp_clusterdist[c][i].value = 1
        else:
            cx = out.metadata.loc[i, str(clonekey)]
            tmp_clusterdist[cx][i].value = 1
    tmp_clusterdist2 = {}
    for x in tmp_clusterdist:
        tmp_clusterdist2[x] = list(tmp_clusterdist[x])

    cluster_dist = {}
    for c_ in tmp_clusterdist2:
        if c_ in list(flatten(overlap)):
            for ol in overlap:
                if c_ in ol:
                    idx = list(
                        set(flatten([tmp_clusterdist2[c_x] for c_x in ol])))
                    if len(list(set(idx))) > 1:
                        dist_mat_ = tmp_totaldist.loc[idx, idx]
                        s1, s2 = dist_mat_.shape
                        if s1 > 1 and s2 > 1:
                            cluster_dist['|'.join(ol)] = dist_mat_
        else:
            dist_mat_ = tmp_totaldist.loc[tmp_clusterdist2[c_],
                                          tmp_clusterdist2[c_]]
            s1, s2 = dist_mat_.shape
            if s1 > 1 and s2 > 1:
                cluster_dist[c_] = dist_mat_

    # to improve the visualisation and plotting efficiency, build a minimum
    # spanning tree for each group/clone to connect the shortest path
    mst_tree = mst(cluster_dist)
    sleep(0.5)

    edge_list = Tree()
    if verbose:
        for c in tqdm(mst_tree, desc='Generating edge list '):
            G = nx.from_pandas_adjacency(mst_tree[c])
            edge_list[c] = nx.to_pandas_edgelist(G)
    else:
        for c in mst_tree:
            G = nx.from_pandas_adjacency(mst_tree[c])
            edge_list[c] = nx.to_pandas_edgelist(G)
    sleep(0.5)

    clone_ref = dict(out.metadata[clonekey])
    tmp_clone_tree = Tree()
    for x in out.metadata.index:
        if '|' in clone_ref[x]:
            for x_ in clone_ref[x].split('|'):
                tmp_clone_tree[x_][x].value = 1
        else:
            tmp_clone_tree[clone_ref[x]][x].value = 1
    tmp_clone_tree2 = Tree()
    for x in tmp_clone_tree:
        tmp_clone_tree2[x] = list(tmp_clone_tree[x])

    tmp_clone_tree3 = Tree()
    tmp_clone_tree3_overlap = Tree()
    for x in tmp_clone_tree2:
        # catch all possible cells that may potentially match up with clones that are joined together
        if x in list(flatten(overlap)):
            for ol in overlap:
                if x in ol:
                    if len(tmp_clone_tree2[x]) > 1:
                        for x_ in tmp_clone_tree2[x]:
                            tmp_clone_tree3_overlap['|'.join(ol)][''.join(
                                x_)].value = 1
                    else:
                        tmp_clone_tree3_overlap['|'.join(ol)][''.join(
                            tmp_clone_tree2[x])].value = 1
        else:
            tmp_ = pd.DataFrame(index=tmp_clone_tree2[x],
                                columns=tmp_clone_tree2[x])
            tmp_ = pd.DataFrame(np.tril(tmp_) + 1,
                                index=tmp_clone_tree2[x],
                                columns=tmp_clone_tree2[x])
            tmp_.fillna(0, inplace=True)
            tmp_clone_tree3[x] = tmp_

    for x in tmp_clone_tree3_overlap:  # repeat for the overlap clones
        tmp_ = pd.DataFrame(index=tmp_clone_tree3_overlap[x],
                            columns=tmp_clone_tree3_overlap[x])
        tmp_ = pd.DataFrame(np.tril(tmp_) + 1,
                            index=tmp_clone_tree3_overlap[x],
                            columns=tmp_clone_tree3_overlap[x])
        tmp_.fillna(0, inplace=True)
        tmp_clone_tree3[x] = tmp_

    # a temporary edge list is used to catch all cells that were identified as clones,
    # to forcefully link them up if they were identical but clipped off during the mst step

    # create a dataframe to recall the actual distance quickly
    tmp_totaldiststack = pd.DataFrame(tmp_totaldist.unstack())
    tmp_totaldiststack.index.names = [None, None]
    tmp_totaldiststack = tmp_totaldiststack.reset_index(drop=False)
    tmp_totaldiststack.columns = ['source', 'target', 'weight']
    tmp_totaldiststack.index = [
        str(s) + '|' + str(t)
        for s, t in zip(tmp_totaldiststack['source'],
                        tmp_totaldiststack['target'])
    ]
    tmp_totaldiststack['keep'] = [
        False if len(list(set(i.split('|')))) == 1 else True
        for i in tmp_totaldiststack.index
    ]
    tmp_totaldiststack = tmp_totaldiststack[tmp_totaldiststack.keep].drop(
        'keep', axis=1)

    tmp_edge_list = Tree()
    if verbose:
        for c in tqdm(tmp_clone_tree3, desc='Linking edges '):
            if len(tmp_clone_tree3[c]) > 1:
                G = nx.from_pandas_adjacency(tmp_clone_tree3[c])
                tmp_edge_list[c] = nx.to_pandas_edgelist(G)
                tmp_edge_list[c].index = [
                    str(s) + '|' + str(t)
                    for s, t in zip(tmp_edge_list[c]['source'],
                                    tmp_edge_list[c]['target'])
                ]
                tmp_edge_list[c]['weight'].update(tmp_totaldiststack['weight'])
                # keep only edges when there is 100% identity, to minimise crowding
                tmp_edge_list[c] = tmp_edge_list[c][tmp_edge_list[c]['weight']
                                                    == 0]
                tmp_edge_list[c].reset_index(inplace=True)
    else:
        for c in tmp_clone_tree3:
            if len(tmp_clone_tree3[c]) > 1:
                G = nx.from_pandas_adjacency(tmp_clone_tree3[c])
                tmp_edge_list[c] = nx.to_pandas_edgelist(G)
                tmp_edge_list[c].index = [
                    str(s) + '|' + str(t)
                    for s, t in zip(tmp_edge_list[c]['source'],
                                    tmp_edge_list[c]['target'])
                ]
                tmp_edge_list[c]['weight'].update(tmp_totaldiststack['weight'])
                # keep only edges when there is 100% identity, to minimise crowding
                tmp_edge_list[c] = tmp_edge_list[c][tmp_edge_list[c]['weight']
                                                    == 0]
                tmp_edge_list[c].reset_index(inplace=True)

    # try to catch situations where there's no edge (only singletons)
    try:
        edge_listx = pd.concat([edge_list[x] for x in edge_list])
        edge_listx.index = [
            str(s) + '|' + str(t)
            for s, t in zip(edge_listx['source'], edge_listx['target'])
        ]
        tmp_edge_listx = pd.concat([tmp_edge_list[x] for x in tmp_edge_list])
        tmp_edge_listx.drop('index', axis=1, inplace=True)
        tmp_edge_listx.index = [
            str(s) + '|' + str(t)
            for s, t in zip(tmp_edge_listx['source'], tmp_edge_listx['target'])
        ]

        edge_list_final = edge_listx.combine_first(tmp_edge_listx)
        edge_list_final['weight'].update(tmp_totaldiststack['weight'])
        # return the edge list
        edge_list_final.reset_index(drop=True, inplace=True)
    except:
        edge_list_final = None

    # and finally the vertex list which is super easy
    vertice_list = list(out.metadata.index)
    sleep(0.5)

    # and now to actually generate the network
    g, g_, lyt, lyt_ = generate_layout(vertice_list,
                                       edge_list_final,
                                       min_size=min_size,
                                       weight=None,
                                       verbose=verbose,
                                       **kwargs)

    # convert distance matrices to sparse
    for x in dmat:
        if type(dmat[x]) is np.ndarray:
            dmat[x] = csr_matrix(dmat[x])

    if verbose:
        logg.info(' finished',
                  time=start,
                  deep=('Updated Dandelion object: \n'
                        '   \'data\', contig-indexed clone table\n'
                        '   \'metadata\', cell-indexed clone table\n'
                        '   \'distance\', heavy and light chain distance matrices\n'
                        '   \'edges\', network edges\n'
                        '   \'layout\', network layout\n'
                        '   \'graph\', network'))
    if self.__class__ == Dandelion:
        if self.germline is not None:
            germline_ = self.germline
        else:
            germline_ = None
        if self.threshold is not None:
            threshold_ = self.threshold
        else:
            threshold_ = None
        if downsample is not None:
            out = Dandelion(data=dat_,
                            distance=dmat,
                            edges=edge_list_final,
                            layout=(lyt, lyt_),
                            graph=(g, g_),
                            germline=germline_,
                            locus=locus)
            out.threshold = threshold_
            return out
        else:
            self.__init__(data=self.data,
                          metadata=self.metadata,
                          distance=dmat,
                          edges=edge_list_final,
                          layout=(lyt, lyt_),
                          graph=(g, g_),
                          germline=germline_,
                          locus=locus,
                          initialize=False)
            self.threshold = threshold_
    else:
        out = Dandelion(data=dat_,
                        distance=dmat,
                        edges=edge_list_final,
                        layout=(lyt, lyt_),
                        graph=(g, g_),
                        clone_key=clone_key,
                        locus=locus)
        return out
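# The all-vs-all step above hinges on scipy's pdist with a custom metric; a
# self-contained sketch of that pattern (the sample sequences are made up):
import numpy as np
from polyleven import levenshtein
from scipy.spatial.distance import pdist, squareform

seqs = np.array(['CASSLGTDTQYF', 'CASSLGADTQYF', 'CASRRGDTQYF'],
                dtype=object).reshape(-1, 1)

# pdist calls the metric once per pair of rows; squareform expands the
# condensed result into a symmetric all-vs-all matrix, as in dmat above.
dist = squareform(pdist(seqs, lambda x, y: levenshtein(x[0], y[0])))
print(dist)  # 3x3 symmetric matrix of pairwise edit distances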
def resolve_spelling_redirect(dbp_resource: str) -> str:
    redirect = resolve_redirect(dbp_resource)
    if levenshtein(dbp_resource, redirect, 2) > 2:
        # return original resource if the redirect links to a completely different resource
        return dbp_resource
    return redirect
def test_special(self):
    s = (chr(127) + chr(255)) * 33
    self.assertEqual(0, levenshtein(s, s))

def test_unicode_with_k(self):
    for k in (0, 1, 2, 3):
        for (dist, s1, s2) in TEST_UNICODE:
            with self.subTest(k=k, s1=s1, s2=s2):
                self.assertEqual(min(dist, k + 1), levenshtein(s1, s2, k))

def test_unicode(self):
    for (dist, s1, s2) in TEST_UNICODE:
        with self.subTest(s1=s1, s2=s2):
            self.assertEqual(dist, levenshtein(s1, s2))

def test_long(self):
    for (dist, s1, s2) in TEST_LONG:
        with self.subTest(s1=s1, s2=s2):
            self.assertEqual(dist, levenshtein(s1, s2))

def test_ascii_with_k(self):
    for k in (0, 1, 2, 3):
        for (dist, s1, s2) in TEST_ASCII:
            with self.subTest(k=k, s1=s1, s2=s2):
                self.assertEqual(min(dist, k + 1), levenshtein(s1, s2, k))

def test_ascii(self):
    for (dist, s1, s2) in TEST_ASCII:
        with self.subTest(s1=s1, s2=s2):
            self.assertEqual(dist, levenshtein(s1, s2))
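# The min(dist, k + 1) assertions above reflect polyleven's bounded mode:
# levenshtein(s1, s2, k) returns the exact distance when it is <= k, and
# k + 1 as soon as the distance exceeds the threshold.
from polyleven import levenshtein

print(levenshtein('kitten', 'sitting'))     # 3
print(levenshtein('kitten', 'sitting', 3))  # 3 (within the bound)
print(levenshtein('kitten', 'sitting', 1))  # 2, i.e. k + 1: "more than 1"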