def test_descending():

    """
    By default, sort_dict() should hand back the entries of an ordered
    dictionary in descending order.
    """

    ascending = [('a', 1), ('b', 2), ('c', 3)]

    result = sort_dict(OrderedDict(ascending))

    # The expected output is simply the input pairs in reverse.
    assert list(result.items()) == ascending[::-1]
def term_counts(self):

    """
    Build an ordered dictionary mapping each term to the number of entries
    stored for it in `self.terms`, then pass it through utils.sort_dict().
    """

    tallies = OrderedDict(
        (term, len(self.terms[term]))
        for term in self.terms
    )

    return utils.sort_dict(tallies)
def test_ascending():

    """
    Passing desc=False should produce the entries in ascending order.
    """

    descending = [('c', 3), ('b', 2), ('a', 1)]

    result = sort_dict(OrderedDict(descending), desc=False)

    # The expected output is simply the input pairs in reverse.
    assert list(result.items()) == descending[::-1]
def anchored_pairs(self, anchor):

    """
    Look up the pair value between an anchor term and every term in
    `self.terms`, keeping only the truthy results.

    :param anchor: The anchor term.
    """

    # Lazily score each term against the anchor, in `self.terms` order.
    scored = (
        (term, self.get_pair(anchor, term))
        for term in self.terms
    )

    # Falsy scores (e.g. None or 0) are left out of the result.
    kept = OrderedDict(
        (term, score)
        for term, score in scored
        if score
    )

    return utils.sort_dict(kept)
def frequency_ratios(self):

    """
    For each term, get the ratio between the number of times that the term
    occurs and the number of times that the most frequent term occurs.

    The counts come from self.term_counts(); the resulting ratios are passed
    through utils.sort_dict() before being returned.
    """

    counts = self.term_counts()

    # Materialize the values before handing them to NumPy: on Python 3
    # `dict.values()` is a view object, which NumPy wraps as a single
    # 0-d object array instead of reducing over the individual counts.
    highest = float(np.amax(list(counts.values())))

    # `highest` is a float, so this is true division on Python 2 as well.
    for term in self.terms:
        counts[term] /= highest

    return utils.sort_dict(counts)
def densities(self, **kwargs):

    """
    Compute a "density" score for every term: the term's normalized KDE
    maximum multiplied by its frequency ratio.

    Keyword arguments are forwarded to normalized_kde_maxima().
    """

    heights = self.normalized_kde_maxima(**kwargs)
    ratios = self.frequency_ratios()

    scores = OrderedDict(
        (term, heights[term] * ratios[term])
        for term in self.terms
    )

    return utils.sort_dict(scores)
def normalized_kde_maxima(self, **kwargs):

    """
    For each term, get the difference between the term's KDE max and the
    lowest KDE max of any term.

    Keyword arguments are forwarded to kde_max(); the shifted maxima are
    passed through utils.sort_dict() before being returned.
    """

    maxima = OrderedDict()
    for term in self.terms:
        maxima[term] = self.kde_max(term, **kwargs)

    # Materialize the values before handing them to NumPy: on Python 3
    # `dict.values()` is a view object, which NumPy wraps as a single
    # 0-d object array instead of reducing over the individual maxima.
    lowest = np.amin(list(maxima.values()))

    # Shift every maximum so that the smallest one becomes zero.
    for term in self.terms:
        maxima[term] -= lowest

    return utils.sort_dict(maxima)
def anchored_scores(self, anchor, method='braycurtis', **kwargs):

    """
    Score an anchor term against every term in `self.terms`.

    :param anchor: The anchor term.
    :param method: The scoring function; resolved to the method named
        'score_' + method on this object.

    Extra keyword arguments are forwarded to the scoring function.
    """

    score_fn = getattr(self, 'score_' + method)

    scores = OrderedDict(
        (term, score_fn(anchor, term, **kwargs))
        for term in self.terms
    )

    return utils.sort_dict(scores)
def anchored_scores(self, anchor, method='braycurtis', **kwargs):

    """
    Evaluate the chosen scoring function between an anchor term and each
    term in `self.terms`.

    :param anchor: The anchor term.
    :param method: The scoring function, looked up as the attribute
        'score_' + method on this object.

    Remaining keyword arguments are handed to the scoring function.
    """

    scorer_name = 'score_' + method
    scorer = getattr(self, scorer_name)

    def against_anchor(term):
        # Score a single term against the fixed anchor.
        return scorer(anchor, term, **kwargs)

    results = OrderedDict()
    results.update((term, against_anchor(term)) for term in self.terms)

    return utils.sort_dict(results)
def anchored_pairs(self, anchor):

    """
    Collect the pair value between an anchor term and each key in
    `self.keys`, skipping keys whose value is falsy.

    Args:
        anchor (str): The anchor term.

    Returns:
        OrderedDict: The collected values after utils.sort_dict().
    """

    found = OrderedDict()

    for key in self.keys:

        distance = self.get_pair(anchor, key)

        # Nothing usable stored for this pair.
        if not distance:
            continue

        found[key] = distance

    return utils.sort_dict(found)
def topn_edit_distances(self, term_depth=500, n=10, **kwargs):

    """
    For each term in text 1, find the term in text 2 with the most similar
    set of nearest-neighbors, in terms of path distance.

    Args:
        term_depth (int): Passed as `term_depth` to build_graph() for both
            texts.
        n (int): The number of neighbors to consider. Also used as the
            `cutoff` given to dijkstra().
        **kwargs: Forwarded to build_graph() for both texts.

    Returns:
        list: Tuples of (t1 term, t2 term, distance), ordered by ascending
            distance.
    """

    # Build graphs.
    g1 = self.text1.build_graph(term_depth=term_depth, **kwargs)
    g2 = self.text2.build_graph(term_depth=term_depth, **kwargs)

    # Get lengths between all pairs.
    dj1 = dijkstra(g1.graph, cutoff=n)
    dj2 = dijkstra(g2.graph, cutoff=n)

    # The top-n neighbors of a text 2 term do not depend on which text 1
    # term they are compared against, so slice them once up front rather
    # than once per (text 1 term, text 2 term) pair.
    top2 = [(s2, list(t2.keys())[:n]) for s2, t2 in dj2.items()]

    # For each term in text 1.
    links = []
    for s1, t1 in bar(dj1.items()):

        # Likewise, the text 1 neighbors are fixed for the whole inner loop.
        nn1 = list(t1.keys())[:n]

        # Score against each term in text 2.
        scores = OrderedDict()
        for s2, nn2 in top2:
            scores[s2] = editdistance.eval(nn1, nn2)

        # Get the closest neighbor.
        scores = sort_dict(scores, desc=False)
        winner = list(scores.items())[0]

        # Register the match.
        links.append((s1, winner[0], winner[1]))

    # Sort strongest -> weakest (smallest edit distance first).
    links = sorted(links, key=lambda x: x[2])

    return links
def test_ascending():

    """
    With desc=False, sort_dict() should return the entries in ascending
    order.
    """

    # Insert the entries in descending order.
    source = OrderedDict()
    source['c'] = 3
    source['b'] = 2
    source['a'] = 1

    expected = [('a', 1), ('b', 2), ('c', 3)]

    assert list(sort_dict(source, desc=False).items()) == expected
def test_descending():

    """
    Called without `desc`, sort_dict() should return the entries of an
    ordered dictionary in descending order.
    """

    # Insert the entries in ascending order.
    source = OrderedDict()
    source['a'] = 1
    source['b'] = 2
    source['c'] = 3

    expected = [('c', 3), ('b', 2), ('a', 1)]

    assert list(sort_dict(source).items()) == expected
def dijkstra(graph, cutoff):

    """
    Compute the path distance between all nodes in a graph.

    Args:
        graph (nx.Graph)
        cutoff (int): Passed to nx.all_pairs_dijkstra_path_length() as its
            second positional argument.

    Returns:
        dict: A map of node (str) -> neighbors (OrderedDict).
    """

    # Get source -> target distances. Wrapping in dict() accepts both return
    # shapes of this NetworkX call: a plain dict, or an iterator of
    # (source, lengths) pairs as returned by newer NetworkX releases.
    lengths = dict(nx.all_pairs_dijkstra_path_length(graph, cutoff))

    # Order the targets by distance. Build a fresh mapping rather than
    # reassigning entries of the dict that is being iterated.
    return {
        source: sort_dict(targets, desc=False)
        for source, targets in lengths.items()
    }