Beispiel #1
0
def irbo(topics, weight=0.9, topk=10):
    """
    Compute the inverted rank-biased overlap (1 - mean pairwise RBO).

    Every unordered pair of topics is compared with ``rbo`` on their
    first ``topk`` words, and the mean of those values is subtracted
    from 1.

    Parameters
    ----------
    topics: a list of lists of words
    weight: p (float), default 0.9: Weight of each
        agreement at depth d:p**(d-1). When set
        to 1.0, there is no weight, the rbo returns
        to average overlap.
    topk: top k words on which the topic diversity
          will be computed

    Returns
    -------
    irbo : one minus the mean of the pairwise rbo values over the topics

    Raises
    ------
    ValueError
        If the first topic holds fewer than ``topk`` words. Only the
        first topic's length is checked.
    """
    if topk > len(topics[0]):
        raise ValueError('Words in topics are less than topk')

    collect = []
    for list1, list2 in combinations(topics, 2):
        # Map the words of both topics to integer ids so that rbo
        # compares integers rather than strings.
        word2index = get_word2index(list1, list2)
        indexed_list1 = [word2index[word] for word in list1]
        indexed_list2 = [word2index[word] for word in list2]
        # NOTE(review): element [2] of the rbo result is presumably the
        # extrapolated score -- confirm against the rbo implementation.
        rbo_val = rbo(indexed_list1[:topk], indexed_list2[:topk], p=weight)[2]
        collect.append(rbo_val)
    return 1 - np.mean(collect)
Beispiel #2
0
def calc_rbo(a, b, p, count):
    """
    Return the 'ext' RBO score of two rankings and a two-sided p-value.

    Parameters
    ----------
    a, b : the two rankings handed to ``rbo``
    p : the weight parameter handed to ``rbo``
    count : number of items used to scale the score; must be greater
        than 1, otherwise the scaling term divides by zero

    Returns
    -------
    (ext, p_value) : the ``'ext'`` entry of the rbo result and the
        two-sided normal-tail probability of the standardized score
    """
    ext = rbo(a, b, p)['ext']

    # NOTE(review): the denominator has the form of the null standard
    # deviation of Kendall's tau; whether it is appropriate for an RBO
    # score should be confirmed.
    Z = ext / sqrt((2 * (2 * count + 5)) / (9 * count * (count - 1)))

    # Two-sided p-value from the standard normal survival function.
    p_value = scipy.stats.norm.sf(abs(Z)) * 2

    return ext, p_value
def compute_rank_biased_overlap(ranking_A, ranking_B, rbo_parameter=0.99):
    """
    Compute and print the RBO estimate and the average overlap.

    Parameters
    ----------
    ranking_A, ranking_B : the two rankings to compare
    rbo_parameter : value passed to ``rbo`` as ``p`` (default 0.99)

    Returns
    -------
    (rbo_estimate, reference_overlap) : the ``ext`` attribute of the
        rbo result and the value returned by ``average_overlap``
    """
    rbo_estimate = rbo(ranking_A, ranking_B, p=rbo_parameter).ext
    reference_overlap = average_overlap(ranking_A, ranking_B)

    print('Rank-biased overlap estimate: {:.4f}'.format(rbo_estimate))
    print('Average overlap = {:.4f}'.format(reference_overlap))

    return rbo_estimate, reference_overlap
Beispiel #4
0
def main():
    """
    Compare two runs query by query and print the result as CSV.

    For every query present in both runs, one row is printed with the
    intersection and union sizes of the two rankings and the 'min',
    'res' and 'ext' values returned by ``rbo.rbo``. A final 'mean' row
    averages those columns over all queries.

    Returns
    -------
    list of dict : one dict per query, holding the printed values
    """
    args = parse_args()
    p = args.p

    # Read both inputs before parsing either of them.
    lines1 = args.file1.readlines()
    lines2 = args.file2.readlines()
    run1 = parse_run(lines1)
    run2 = parse_run(lines2)

    # Only queries found in both runs are compared; sort numerically
    # when every id is an integer, otherwise lexicographically.
    shared = list(run1.keys() & run2.keys())
    try:
        queries = sorted(shared, key=int)
    except ValueError:
        queries = sorted(shared)

    row_format = '{},{},{},{},{:f},{:f},{:f}'
    results = []
    print('qno,intersection,union,p,min,res,ext')
    for q in queries:
        ranking1, ranking2 = run1[q], run2[q]
        result = rbo.rbo(ranking1, ranking2, p)
        items1, items2 = set(ranking1), set(ranking2)
        overlap = len(items1 & items2)
        total = len(items1 | items2)
        results.append({
            'qno': q,
            'intersection': overlap,
            'union': total,
            'p': p,
            'min': result['min'],
            'res': result['res'],
            'ext': result['ext']
        })
        print(row_format.format(
            q, overlap, total, p, result['min'], result['res'], result['ext']))

    # Column means over all compared queries.
    means = {
        column: np.mean([row[column] for row in results])
        for column in ('min', 'res', 'ext', 'intersection', 'union')
    }
    print(row_format.format('mean', means['intersection'], means['union'],
                            p, means['min'], means['res'], means['ext']))
    return results
Beispiel #5
0
# Load the ranked lists to compare; `count` is handed to each reader.
experiment = read_list(
    "/home/gentoo/src/similarity/test_strategies/ex_vs_ex_experiment/expression.csv",
    count)
mean = read_list(
    "/home/gentoo/src/similarity/test_strategies/ex_vs_ex_gene_mean/expression.csv",
    count)
max = read_list(
    "/home/gentoo/src/similarity/test_strategies/ex_vs_ex_gene_max/expression.csv",
    count)
abi = read_list2("ABI-correlation-Znfx1-69524632.csv", count)

print(experiment, mean, max, abi)

# Pairwise comparisons whose label line and rbo result are both printed.
printed_comparisons = (
    ("exp", "mean", experiment, mean),
    ("exp", "max", experiment, max),
    ("mean", "max", mean, max),
    ("ABI", "exp", abi, experiment),
)
for left_label, right_label, left_list, right_list in printed_comparisons:
    print(left_label, right_label)
    re = rbo(left_list, right_list, 0.9)
    print(re)

# The last comparison is computed but its result is not printed here.
print("ABI", "mean")
re = rbo(abi, mean, 0.9)
Beispiel #6
0
        ranks.sort(reverse=True)

        vector = []

        if not topK or topK > len(ranks):
            topK = len(ranks)

        i = 0

        while i < topK:
            rank = ranks[i]
            vector.append(set(myMap[rank]))
            i += 1

        return vector


if __name__ == "__main__":
    # Usage: <script> FILE1 FILE2 [TOPK]
    # The third argument is only read when exactly three are supplied;
    # otherwise no cap is passed to getVector.
    topK = int(sys.argv[3]) if len(sys.argv) == 4 else None

    v1 = getVector(sys.argv[1], topK)
    v2 = getVector(sys.argv[2], topK)

    # Compare the two vectors with p fixed at 0.8 and report only the
    # 'ext' entry of the result.
    result = rbo(v1, v2, p=0.8)
    print(result["ext"])
#    print("res " + str(result["res"]))
#    print("min " + str(result["min"]))
Beispiel #7
0
        ind = max.index(elem)
        max_rank.append([mean_rank[ind][0], elem])
    except ValueError:
        ind = -1
        max_rank.append([1, elem])

# Keep only the first item of every entry in the two rank lists.
score_max = [x[0] for x in max_rank]
score_mean = [x[0] for x in mean_rank]

print(score_max)
print(score_mean)

# Wilcoxon signed-rank test on the paired scores; only the p-value is
# printed.
s, p = wilcoxon(score_max, score_mean, zero_method="pratt")

print("wilcoxon")
print(p)

# Weighted Kendall tau on the same scores; only the correlation is printed.
cor, p = weightedtau(score_max, score_mean)
print("weighted tau")
print(cor)

# Compare the mean and max lists directly: rbo result first, then the
# Spearman correlation with its p-value.
print("mean", "max")
re = rbo(mean, max, 0.9)
print(re)
cor, p = spearmanr(mean, max)
print(cor, p)
Beispiel #8
0

# Load the first 200 entries of each ABI correlation list.
Mef2c_667 = read_list("ABI-correlation-Mef2c-667.csv", 200)
Znfx1 = read_list("ABI-correlation-Znfx1-69524632.csv", 200)

# Nlk is the top hit when querying Mef2c, so it should be very close to
# Mef2c and serves as a point of comparison.
Nlk = read_list("ABI-correlation-Nlk-76085742.csv", 200)
Mef2c_668 = read_list("ABI-correlation-Mef2c-668.csv", 200)

Stx1a = read_list("ABI-correlation-Stx1a-2645.csv", 200)

Mef2c_79567505 = read_list("ABI-correlation-Mef2c-79567505.csv", 200)

# Compare Mef2c_667 with itself and with each of the other selected lists.
compared_against = (
    ("Mef2c_667", Mef2c_667),
    ("Nlk", Nlk),
    ("Mef2c_668", Mef2c_668),
    ("Mef2c_79567505", Mef2c_79567505),
)
for other_label, other_list in compared_against:
    print("Mef2c_667", other_label)
    re = rbo(Mef2c_667, other_list, 0.9)
    print(re)