コード例 #1
0
ファイル: intention.py プロジェクト: kinow/listit-server
def aov_padded(arows, feature_name, min_N_per_user=20, formula = "%s ~ cat + participant"):
    fla = formula % feature_name
    fmla = ro.Formula(fla)
    fmla.environment = ro.Environment()
    env = fmla.environment
    feats = r.c()
    owners = r.c()
    cats = r.c()
    for row in arows:
        rf = get_feature_for_note(row[0],feature_name)
        if rf is None: continue
        feats = r.c(feats,rf)
        cats = r.c(cats,row[aci('primary')])
        owners = r.c(owners,row[aci('owner_id')])


    # if we have less than N notes per person, we have less than a representative sample :(
    # so, to prevent the world from blowing, we fill in     
        
    notes_per_owner_in_arows = nltk.FreqDist( [ row[aci('owner_id')]  for row in arows ] )
    note_ids = set( [row[0] for row in arows ])
    for owner_id,v in notes_per_owner_in_arows.iteritems():
        owned_notes_not_yet_chosen = [x for x in User.objects.filter(id=owner_id)[0].note_owner.all() if x.id not in note_ids and len(x.contents.strip()) > 0]
        if v < min_N_per_user:
            to_choose_k = min(len(owned_notes_not_yet_chosen),min_N_per_user-v)
            chosen = random.sample(owned_notes_not_yet_chosen,to_choose_k)
            assert to_choose_k == len(chosen), "Could not find note, somethings wrong %d %d" % (to_choose_k , len(chosen) )
            for chnote in chosen:
                feat = nl.compute_feature_named(feature_name,chnote)
                feats = r.c(feats,nl.compute_feature_named(feature_name,chnote))
                cats = r.c(cats,'uncategorized')
                owners = r.c(owners,owner_id)
            print "adding ",to_choose_k," to ", owner_id

    env[feature_name] = feats
    env['cat'] = r('as.factor')(cats)
    env['participant'] = r('as.factor')(owners)

    print 'feats', env[feature_name]
    print 'cat', env['cat']
    print 'part', env['participant']
    return fmla
コード例 #2
0
ファイル: intention.py プロジェクト: kinow/listit-server
def compute_avg_for_overall_interesting(interesting_users,feature_name):
    interesting_notes = reduce(lambda x,y: x+y, [ list(i.note_owner.all().values()) for i in interesting_users])
    def printstats(varr):
        return [("len: ", len(varr)),
                ("mean: ",mean(varr)),
                ("median: ",median(varr)),
                ("min ", min(varr)),
                ("max ", max(varr)),               
                ("stdev:", pow(ca.var(varr),0.5) if len(varr) > 1 else "CANT COMPUTE len = 1")]

    
    print printstats([ nl.compute_feature_named(feature_name, n) for n in interesting_notes if nl.compute_feature_named(feature_name, n) is not None and nl.compute_feature_named(feature_name, n) >= 0])
コード例 #3
0
ファイル: intention.py プロジェクト: kinow/listit-server
def get_feature_for_note(nid,feature_name,coerce_fn=lambda x: float(x)):
    """Return the value of ``feature_name`` for the note with id ``nid``.

    The data returned by ``nl.read()`` is consulted first: when it contains
    a note whose ``"id"`` equals ``str(nid)`` and ``feature_name`` is listed
    among its note, feature or label fields, the stored value is passed
    through ``coerce_fn`` and returned.

    Otherwise the feature is computed with ``nl.compute_feature_named`` from
    the first ``Note`` row matching ``nid``; that result is returned as is,
    without applying ``coerce_fn``.
    """
    store = nl.read()
    wanted_id = str(nid)
    matches = [entry for entry in store["notes"] if entry["id"] == wanted_id]

    if matches:
        known_fields = store['note_fields'] + store['feature_fields'] + store['label_fields']
        if feature_name in known_fields:
            return coerce_fn(matches[0][feature_name])

    # Unknown note or unknown feature name: compute it from the database row.
    note_row = Note.objects.filter(id=nid).values()[0]
    return nl.compute_feature_named(feature_name, note_row)