Example #1
0
def test_descending():
    """
    With no extra arguments, sort_dict() returns the entries of an ordered
    dictionary in descending order.
    """

    source = OrderedDict()
    for key, value in (('a', 1), ('b', 2), ('c', 3)):
        source[key] = value

    expected = [('c', 3), ('b', 2), ('a', 1)]
    result = sort_dict(source)

    assert list(result.items()) == expected
Example #2
0
    def term_counts(self):
        """
        Map each term to the length of its entry in self.terms, and return
        the mapping after passing it through utils.sort_dict().
        """

        tallies = OrderedDict(
            (term, len(self.terms[term])) for term in self.terms
        )

        return utils.sort_dict(tallies)
Example #3
0
def test_ascending():
    """
    Passing desc=False makes sort_dict() return the entries in ascending
    order.
    """

    source = OrderedDict()
    for key, value in (('c', 3), ('b', 2), ('a', 1)):
        source[key] = value

    expected = [('a', 1), ('b', 2), ('c', 3)]
    result = sort_dict(source, desc=False)

    assert list(result.items()) == expected
Example #4
0
    def term_counts(self):

        """
        Build a term -> count mapping, where the count is the length of the
        term's entry in self.terms, and return it via utils.sort_dict().
        """

        per_term = [(term, len(self.terms[term])) for term in self.terms]
        tallies = OrderedDict(per_term)

        return utils.sort_dict(tallies)
Example #5
0
    def anchored_pairs(self, anchor):
        """
        Look up the pair score between an anchor term and every term in
        self.terms, keeping only the truthy scores.

        :param anchor: The anchor term.
        """

        scored = OrderedDict()
        for term in self.terms:
            value = self.get_pair(anchor, term)

            # Falsy scores (None, 0, ...) are left out of the result.
            if not value:
                continue

            scored[term] = value

        return utils.sort_dict(scored)
Example #6
0
    def frequency_ratios(self):
        """
        For each term, get the ratio between the number of times that the term
        occurs and the number of times that the most frequent term occurs.

        The counts come from self.term_counts(); each one is divided by the
        largest count, and the result is passed through utils.sort_dict().
        """

        counts = self.term_counts()

        # Materialize the values before handing them to NumPy: on Python 3
        # .values() is a view object, which np.amax() would treat as a single
        # opaque element instead of a sequence of numbers.
        highest = float(np.amax(list(counts.values())))
        for term in self.terms:
            counts[term] /= highest

        return utils.sort_dict(counts)
Example #7
0
    def densities(self, **kwargs):
        """
        Compute a "density" score per term: the term's normalized KDE maximum
        multiplied by its frequency ratio. Keyword arguments are forwarded to
        normalized_kde_maxima().
        """

        maxima = self.normalized_kde_maxima(**kwargs)
        ratios = self.frequency_ratios()

        scores = OrderedDict(
            (term, maxima[term] * ratios[term]) for term in self.terms
        )

        return utils.sort_dict(scores)
Example #8
0
    def frequency_ratios(self):

        """
        For each term, get the ratio between the number of times that the term
        occurs and the number of times that the most frequent term occurs.

        The counts come from self.term_counts(); each one is divided by the
        largest count, and the result is passed through utils.sort_dict().
        """

        counts = self.term_counts()

        # Materialize the values before handing them to NumPy: on Python 3
        # .values() is a view object, which np.amax() would treat as a single
        # opaque element instead of a sequence of numbers.
        highest = float(np.amax(list(counts.values())))
        for term in self.terms:
            counts[term] /= highest

        return utils.sort_dict(counts)
Example #9
0
    def normalized_kde_maxima(self, **kwargs):
        """
        For each term, get the difference between the term's KDE max and the
        lowest KDE max of any term.

        Keyword arguments are forwarded to self.kde_max(). The shifted values
        are passed through utils.sort_dict() before being returned.
        """

        maxima = OrderedDict()
        for term in self.terms:
            maxima[term] = self.kde_max(term, **kwargs)

        # Materialize the values before handing them to NumPy: on Python 3
        # .values() is a view object, which np.amin() would treat as a single
        # opaque element instead of a sequence of numbers.
        lowest = np.amin(list(maxima.values()))
        for term in self.terms:
            maxima[term] -= lowest

        return utils.sort_dict(maxima)
Example #10
0
    def anchored_scores(self, anchor, method='braycurtis', **kwargs):
        """
        Score an anchor term against every term in self.terms.

        :param anchor: The anchor term.
        :param method: The scoring function; the method named
            'score_' + method on this object is used. Extra keyword
            arguments are forwarded to it.
        """

        scorer = getattr(self, 'score_' + method)

        scored = OrderedDict(
            (term, scorer(anchor, term, **kwargs)) for term in self.terms
        )

        return utils.sort_dict(scored)
Example #11
0
    def densities(self, **kwargs):

        """
        Compute a "density" score per term: the term's normalized KDE maximum
        multiplied by its frequency ratio. Keyword arguments are forwarded to
        normalized_kde_maxima().
        """

        heights = self.normalized_kde_maxima(**kwargs)
        ratios = self.frequency_ratios()

        products = [(term, heights[term] * ratios[term])
                    for term in self.terms]

        return utils.sort_dict(OrderedDict(products))
Example #12
0
    def normalized_kde_maxima(self, **kwargs):

        """
        For each term, get the difference between the term's KDE max and the
        lowest KDE max of any term.

        Keyword arguments are forwarded to self.kde_max(). The shifted values
        are passed through utils.sort_dict() before being returned.
        """

        maxima = OrderedDict()
        for term in self.terms:
            maxima[term] = self.kde_max(term, **kwargs)

        # Materialize the values before handing them to NumPy: on Python 3
        # .values() is a view object, which np.amin() would treat as a single
        # opaque element instead of a sequence of numbers.
        lowest = np.amin(list(maxima.values()))
        for term in self.terms:
            maxima[term] -= lowest

        return utils.sort_dict(maxima)
Example #13
0
    def anchored_scores(self, anchor, method='braycurtis', **kwargs):

        """
        Score an anchor term against every term in self.terms.

        :param anchor: The anchor term.
        :param method: The scoring function; the method named
            'score_' + method on this object is used. Extra keyword
            arguments are forwarded to it.
        """

        scorer = getattr(self, 'score_' + method)

        scored = OrderedDict()
        for candidate in self.terms:
            result = scorer(anchor, candidate, **kwargs)
            scored[candidate] = result

        return utils.sort_dict(scored)
Example #14
0
    def anchored_pairs(self, anchor):
        """
        Look up the pair score between an anchor term and every term in
        self.keys, keeping only the truthy scores.

        Args:
            anchor (str): The anchor term.

        Returns:
            OrderedDict: The kept scores, ordered by utils.sort_dict().
        """

        scored = OrderedDict()

        for term in self.keys:
            value = self.get_pair(anchor, term)

            # Falsy scores (None, 0, ...) are left out of the result.
            if not value:
                continue

            scored[term] = value

        return utils.sort_dict(scored)
Example #15
0
    def topn_edit_distances(self, term_depth=500, n=10, **kwargs):

        """
        For each term in text 1, find the term in text 2 with the most similar
        set of nearest-neighbors, in terms of path distance.

        Args:
            term_depth (int): Passed to build_graph() for both texts.
            n (int): The number of neighbors to consider. Also used as the
                cutoff for the path-distance computation.
            **kwargs: Forwarded to build_graph() for both texts.

        Returns:
            list: Tuples of (t1 term, t2 term, distance), sorted by
                ascending distance.
        """

        # Build graphs.
        g1 = self.text1.build_graph(term_depth=term_depth, **kwargs)
        g2 = self.text2.build_graph(term_depth=term_depth, **kwargs)

        # Get lengths between all pairs.
        dj1 = dijkstra(g1.graph, cutoff=n)
        dj2 = dijkstra(g2.graph, cutoff=n)

        # The top-n neighbor list of a text 2 term does not depend on which
        # text 1 term it is compared with, so slice each one only once
        # instead of once per (text 1 term, text 2 term) pair.
        neighbors2 = [(s2, list(t2.keys())[:n]) for s2, t2 in dj2.items()]

        # For each term in text 1.
        links = []
        for s1, t1 in bar(dj1.items()):

            # Likewise, the text 1 neighbor list is fixed for the inner loop.
            nn1 = list(t1.keys())[:n]

            # Score against each term in text 2.
            scores = OrderedDict()
            for s2, nn2 in neighbors2:
                scores[s2] = editdistance.eval(nn1, nn2)

            # Get the closest neighbor.
            scores = sort_dict(scores, desc=False)
            winner = list(scores.items())[0]

            # Register the match.
            links.append((s1, winner[0], winner[1]))

        # Sort strongest -> weakest (smallest edit distance first).
        links = sorted(links, key=lambda x: x[2])

        return links
Example #16
0
def test_ascending():

    """
    Passing desc=False makes sort_dict() return the entries in ascending
    order.
    """

    keys = ['c', 'b', 'a']
    values = [3, 2, 1]
    d = OrderedDict(zip(keys, values))

    expected = [('a', 1), ('b', 2), ('c', 3)]

    asc = sort_dict(d, desc=False)

    assert list(asc.items()) == expected
Example #17
0
def test_descending():

    """
    With no extra arguments, sort_dict() returns the entries of an ordered
    dictionary in descending order.
    """

    keys = ['a', 'b', 'c']
    values = [1, 2, 3]
    d = OrderedDict(zip(keys, values))

    expected = [('c', 3), ('b', 2), ('a', 1)]

    desc = sort_dict(d)

    assert list(desc.items()) == expected
Example #18
0
    def anchored_pairs(self, anchor):

        """
        Look up the pair score between an anchor term and every term in
        self.keys, keeping only the truthy scores.

        Args:
            anchor (str): The anchor term.

        Returns:
            OrderedDict: The kept scores, ordered by utils.sort_dict().
        """

        # Score every key first, then drop the falsy results.
        candidates = ((term, self.get_pair(anchor, term)) for term in self.keys)
        kept = OrderedDict(
            (term, value) for term, value in candidates if value
        )

        return utils.sort_dict(kept)
Example #19
0
def dijkstra(graph, cutoff):

    """
    Compute the path distance between all nodes in a graph.

    Args:
        graph (nx.Graph)
        cutoff (int): Passed to networkx as the path-length cutoff.

    Returns:
        dict: A map of node (str) -> neighbors (OrderedDict).
    """

    # Get source -> target distances. Depending on the networkx version this
    # call returns either a dict or an iterator of (source, targets) pairs;
    # dict() accepts both, so the rest of the function sees a real mapping.
    distances = dict(nx.all_pairs_dijkstra_path_length(graph, cutoff))

    # Order the targets by distance. A new dict is built rather than
    # assigning into the mapping that is being iterated.
    return {
        source: sort_dict(targets, desc=False)
        for source, targets in distances.items()
    }