def get_nls_for_note(nid, nlread=None):
    """Return the label-field values stored for the note with id ``nid``.

    Args:
        nid: Note identifier. It is converted with ``str()`` before being
            compared against each note's ``"id"`` entry.
        nlread: Optional, already-loaded result of ``nl.read()``. It is
            indexed with ``"notes"`` and ``"label_fields"``. When omitted,
            ``nl.read()`` is called here; pass it in when looking up many
            notes to avoid re-reading for each one.

    Returns:
        A list with the matching note's value for every name in
        ``nlread["label_fields"]`` (in that order), or ``None`` when no
        note matches -- in which case a message is printed as well.
    """
    if nlread is None:
        nlread = nl.read()

    wanted_id = str(nid)
    # Stop at the first match instead of materialising every match; this
    # also avoids calling len() on filter(), which is an iterator on Python 3.
    note = next((n_ for n_ in nlread["notes"] if n_["id"] == wanted_id), None)
    if note is None:
        # Single-argument print: same output under Python 2 and Python 3.
        print("%s %s" % ("nothing found for ", nid))
        return None
    return [note[field] for field in nlread["label_fields"]]
def get_feature_for_note(nid, feature_name, coerce_fn=lambda x: float(x)):
    """Return the value of ``feature_name`` for the note with id ``nid``.

    The value is taken from the data returned by ``nl.read()`` when the note
    is present there and ``feature_name`` is listed in its ``note_fields``,
    ``feature_fields`` or ``label_fields``. Otherwise the feature is computed
    via ``nl.compute_feature_named`` from the ``Note`` row with that id.

    Args:
        nid: Note identifier; compared as ``str(nid)`` against each note's
            ``"id"`` entry.
        feature_name: Name of the field/feature to fetch.
        coerce_fn: Callable applied to the stored value before returning it
            (defaults to conversion with ``float``). It is NOT applied on
            the computed fallback path.

    Returns:
        ``coerce_fn(stored_value)`` for a stored feature, otherwise whatever
        ``nl.compute_feature_named`` returns.
    """
    nlread = nl.read()
    wanted_id = str(nid)
    # First match only; avoids len() on filter(), which is an iterator on
    # Python 3, and avoids scanning the remaining notes.
    note = next((n_ for n_ in nlread["notes"] if n_["id"] == wanted_id), None)

    if note is not None:
        known_fields = (
            nlread['note_fields']
            + nlread['feature_fields']
            + nlread['label_fields']
        )
        if feature_name in known_fields:
            return coerce_fn(note[feature_name])

    # Unknown note or unknown feature name: compute the feature on the fly.
    # NOTE(review): presumably Note.objects is a Django manager; indexing
    # [0] would then fail if no row has this id -- confirm callers only
    # pass ids that exist.
    return nl.compute_feature_named(
        feature_name, Note.objects.filter(id=nid).values()[0]
    )
def get_nl_dist_for_cats(arows):
    """Build and print a per-category frequency distribution of note labels.

    For every category in the module-level ``cats``, the notes returned by
    ``get_notes_of_cat(arows, cat)`` are looked up with ``get_nls_for_note``
    and their label values are accumulated into an ``nltk.FreqDist``. Each
    category's distribution is then printed, sorted by descending count,
    together with each count divided by the number of notes in the category.

    Args:
        arows: Passed through unchanged to ``get_notes_of_cat``.

    Returns:
        A tuple ``(freqs, last_total)`` where ``freqs`` maps each category
        to its ``nltk.FreqDist`` and ``last_total`` is the note count of the
        last category iterated.
    """
    nlread = nl.read()  # read once and share it across all note lookups
    freqs = dict((cat, nltk.FreqDist()) for cat in cats)
    ntotals = dict((cat, 0) for cat in cats)

    for cat in cats:
        # Fetch the category's notes once; reused for both the label
        # accumulation and the note count.
        notes_in_cat = get_notes_of_cat(arows, cat)
        for n in notes_in_cat:
            freqs[cat] = freqs[cat] + nltk.FreqDist(get_nls_for_note(n, nlread))
        ntotals[cat] = ntotals[cat] + len(notes_in_cat)

    # now do some printing
    for cat in cats:
        by_count = sorted(freqs[cat].items(), key=lambda tagfreq: -tagfreq[1])
        total = 1.0 * ntotals[cat]  # float, so the ratios are not truncated
        percentages = [(t, y / total) for t, y in by_count]
        # Joined into one string so the output is the same under the
        # Python 2 print statement and the Python 3 print function.
        print(" ".join(
            str(part)
            for part in (cat, "---", by_count, "\nPERCENTAGES", percentages, "\n\n")
        ))

    # NOTE(review): only the LAST category's total is returned (``cat`` is
    # the leftover loop variable), and this fails if ``cats`` is empty.
    # Kept as-is because callers may unpack this exact shape; returning the
    # whole ``ntotals`` mapping looks like the likely intent -- confirm.
    return freqs, ntotals[cat]
def _get_all_note_ids():
    """Return the ``"id"`` of every note in ``nl.read()["notes"]`` as an int."""
    note_ids = []
    for note in nl.read()["notes"]:
        note_ids.append(int(note["id"]))
    return note_ids