def cameo_internal_rankcorr():
    """Disabled analysis: always raises ``AssertionError``.

    The code kept below the guard compared, for every path whose code is
    not exactly two characters long and whose ``path2count`` is at least 5,
    the score of the full code with the score of its two-character prefix
    (both taken from ``cameo_code2score``), and printed an unweighted and a
    count-weighted rank correlation (``method='gk'``).  It is retained for
    reference only and is currently unreachable.

    Raises:
        AssertionError: always.
    """
    # Explicit raise instead of ``assert False``: assert statements are
    # stripped under ``python -O``, which would silently re-enable this
    # abandoned analysis.
    raise AssertionError("not sure we should be using this anymore")

    # --- unreachable reference implementation below ---
    finescores = []
    coarsescores = []
    counts = []
    for p in path2code:
        code = path2code[p]
        # Two-character codes are the coarse level themselves; skip them.
        if len(code) == 2:
            continue
        # Ignore paths seen fewer than 5 times.
        if path2count[p] < 5:
            continue
        finescore = cameo_code2score.get(code)
        if finescore is None:
            continue
        # NOTE(review): plain indexing here raises KeyError if the
        # two-character prefix has no score -- confirm that is intended
        # before re-enabling this function.
        coarsescore = cameo_code2score[code[:2]]
        finescores.append(finescore)
        coarsescores.append(coarsescore)
        counts.append(path2count[p])
    print("%d path types over %d corpus event instances" % (len(counts), sum(counts)))
    # Unit weights give the raw correlation; corpus counts give the weighted one.
    r1 = rankcorr.rankcorr(finescores, coarsescores, [1] * len(counts), method='gk')
    r2 = rankcorr.rankcorr(finescores, coarsescores, counts, method='gk')
    print("raw rank corr = %.4f, weighted by corpus freq = %.4f" % (r1, r2))
def rankcorr_purity_null():
    """Print rank correlations obtained from random predictions.

    Reads the module-level ``scored_paths``, ``path2score``, ``path2count``
    and ``cPathFrame``.  Runs 10000 trials; in each one every path in
    ``scored_paths`` is paired with a random integer drawn by
    ``random.randrange(K)``, where ``K`` is ``cPathFrame.shape[1]``.  The
    result of ``rankcorr.rankcorr(..., method='kendall')`` between those
    random values and the ``path2score`` values, with the ``path2count``
    values passed as the third argument, is printed once per trial.
    """
    num_frames = cPathFrame.shape[1]

    for _trial in xrange(10000):
        # One (gold score, random prediction, count) triple per path, built
        # in a single pass so the lookups and the random draw happen in the
        # same per-path order on every trial.
        triples = [
            (path2score[path], random.randrange(num_frames), path2count[path])
            for path in scored_paths
        ]
        gold_scores = [t[0] for t in triples]
        random_preds = [t[1] for t in triples]
        weights = [t[2] for t in triples]
        print(rankcorr.rankcorr(random_preds, gold_scores, weights, method='kendall'))
def rankcorr_purity():
    """Print the rank correlation between path scores and frame scales.

    Only works for scaled models.  Reads the module-level ``cPathFrame``,
    ``path2num``, ``path2score``, ``scored_paths`` and ``frameScales``.

    For every column ``k`` of ``cPathFrame`` and every path in
    ``scored_paths`` whose cell ``cPathFrame[path2num[path], k]`` is
    positive, one observation is recorded: the path's ``path2score`` value,
    ``frameScales[k]``, and the cell value (passed to ``rankcorr.rankcorr``
    as its third argument).  The number of observations is printed first,
    then the ``method='kendall'`` result formatted to four decimals.
    """
    # posterior prob that a pair is concordant
    # path-level observations (token-level counts); each row is
    # (path score, frame scale, count)
    rows = []

    for frame_idx in range(cPathFrame.shape[1]):
        # Paths with a positive count for this frame.
        active = [
            path for path in scored_paths
            if cPathFrame[path2num[path], frame_idx] > 0
        ]
        rows.extend(
            (
                path2score[path],
                frameScales[frame_idx],
                cPathFrame[path2num[path], frame_idx],
            )
            for path in active
        )

    # NOTE(review): the path scores go in as the first sequence and the
    # frame scales as the second -- confirm that this is the intended
    # argument order for rankcorr.rankcorr.
    path_scores = [row[0] for row in rows]
    frame_scales = [row[1] for row in rows]
    weights = [row[2] for row in rows]

    print(len(weights))
    corr = rankcorr.rankcorr(path_scores, frame_scales, weights, method='kendall')
    print("rankcorr = %.4f" % (corr))