Esempio n. 1
0
def jaccard_score(rates1: SparseVector, rates2: SparseVector) -> Decimal:
    """Compute the rate-aware Jaccard similarity of two rate vectors.

    See https://en.wikipedia.org/wiki/Jaccard_index.

    The vectors are compared position by position (one position per user).
    A position counts as "rated" when its value is greater than zero.
    The rate value matters: a position that is rated in both vectors only
    counts as shared when the two rate values are equal.

    Args:
        rates1: First rate vector; only its ``toArray()`` result is used.
        rates2: Second rate vector; expected to have the same length as
            ``rates1``.

    Returns:
        The number of positions rated identically in both vectors divided
        by the number of positions rated in at least one of them.
        ``Decimal(0)`` when no position is rated in either vector, instead
        of failing on a zero division.
    """
    r1 = rates1.toArray()
    r2 = rates2.toArray()
    rated1 = r1 > 0
    # Intersection: rated, and rated with exactly the same value.
    intersection = int(((r1 == r2) & rated1).sum())
    # Union: rated in at least one of the two vectors.
    union = int((rated1 | (r2 > 0)).sum())
    if union == 0:
        # Nothing rated at all: define the similarity as 0 rather than 0/0.
        return Decimal(0)
    return Decimal(intersection) / union
Esempio n. 2
0
def jaccard_score_binary(rates1: SparseVector,
                         rates2: SparseVector) -> Decimal:
    """Compute the binary Jaccard similarity of two rate vectors.

    See https://en.wikipedia.org/wiki/Jaccard_index.

    The vectors are compared position by position (one position per user).
    The actual rate value is ignored: only whether a position is rated
    (value greater than zero) matters. The assumption is that people who
    watch the same movies have similar preferences, so those movies are
    similar.

    Args:
        rates1: First rate vector; only its ``toArray()`` result is used.
        rates2: Second rate vector; expected to have the same length as
            ``rates1``.

    Returns:
        The number of positions rated in both vectors divided by the
        number of positions rated in at least one of them. ``Decimal(0)``
        when no position is rated in either vector, instead of failing on
        a zero division.
    """
    # NOTE: the vectors are densified here; a sparse-aware implementation
    # could avoid materializing the full arrays.
    rated1 = rates1.toArray() > 0
    rated2 = rates2.toArray() > 0
    intersection = int((rated1 & rated2).sum())
    union = int((rated1 | rated2).sum())
    if union == 0:
        # Nothing rated at all: define the similarity as 0 rather than 0/0.
        return Decimal(0)
    return Decimal(intersection) / union