def cutoffSL(doc, cutoff):
    """Return per-length pippi counts for one document as a list of strings.

    A map/reduce over the Pippies whose ``docs`` field contains ``doc._id``
    counts how many pippies exist for every ``len`` value. The result is laid
    out as a list indexed by length (``'0'`` where a length has no pippies)
    and then sliced as ``[4:cutoff]``.

    Returns an empty list when nothing was counted.
    """
    map_fn = Code("function(){ emit( this.len , { count : 1 } );}")
    reduce_fn = Code("function (key, values) { var count = 0; values.forEach(function (v) {count += v.count;}); return {count: count}; }")

    counts = {}
    # The map/reduce is only run when the collection holds at least one pippi.
    if Pippies.count() > 0:
        reduced = Pippies.map_reduce(map_fn, reduce_fn, 'cutoff sparkline',
                                     query={'docs': doc._id})
        for row in reduced.find():
            counts[row['_id']] = int(row['value']['count'])

    if not counts:
        return []

    upper = int(max(counts.keys()) + 1)
    series = []
    for length in range(upper):
        if length in counts:
            series.append(str(counts[length]))
        else:
            series.append('0')
    return series[4:cutoff]
def getOverview():
    """Collect headline counts for the Docs, Pippies and Frags collections.

    Every collection is counted exactly once, so the ``value`` and ``text``
    fields of an entry always agree and only three count queries are issued
    (previously each entry ran ``count()`` twice).

    Returns a list of three dicts, each with ``title``, ``value`` and ``text``
    keys, in the order Docs, Pippies, Frags.
    """
    doc_count = Docs.count()
    pippi_count = Pippies.count()
    frag_count = Frags.count()
    return [
        {'title': 'Total documents',
         'value': doc_count,
         'text': "%s Documents" % doc_count},
        {'title': 'Total Pippies',
         'value': pippi_count,
         'text': "with %s Pippies" % pippi_count},
        {'title': 'Locations',
         'value': frag_count,
         'text': "in %s Locations" % frag_count},
    ]
def save(self, d1, d2, pkt):
    """Store one pippi packet that links the documents ``d1`` and ``d2``.

    The pippi id is added to the ``pippies`` set of both documents, both
    document ids are added to the pippi's ``docs`` set, and one Frags entry
    is saved for every position listed in ``pkt['d1ps']`` and
    ``pkt['d2ps']``.

    Returns ``pkt``, or ``None`` when ``pkt`` is empty.
    """
    # todo new code to directly addtoset mongo-style
    if not pkt:
        return
    pippi = Pippi(pkt['pippi'])

    link_pippi = {'$addToSet': {'pippies': pippi._id}}
    Docs.update({'_id': d1._id}, link_pippi)
    Docs.update({'_id': d2._id}, {'$addToSet': {'pippies': pippi._id}})

    # NOTE(review): docslen is incremented by 2 unconditionally, even when
    # $addToSet finds the ids already present -- confirm this is intended.
    Pippies.update({'_id': pippi._id},
                   {'$addToSet': {'docs': {'$each': [d1._id, d2._id]}},
                    '$inc': {'docslen': 2}})

    # Collect every (document id, position) pair first, then save them.
    locations = []
    for pos in pkt['d1ps']:
        locations.append((d1._id, pos))
    for pos in pkt['d2ps']:
        locations.append((d2._id, pos))
    for doc_id, pos in locations:
        Frags.save({'pos': pos['pos'],
                    'txt': pos['txt'],
                    'l': pkt['l'],
                    'doc': doc_id,
                    'pippi': pippi._id})
    return pkt
def __init__(self, pippi, oid=None, frag=None):
    """Load a pippi record, or create and save a new one when none is found.

    Lookup order: by ``oid`` when given, otherwise by the space-joined
    ``pippi`` tokens; a ``frag`` passed in is used when neither lookup runs.
    A found record becomes the instance ``__dict__`` and its stored ``pippi``
    string is split back into a tuple. Otherwise a fresh record is built from
    ``pippi`` and saved at once.
    """
    if oid:
        # get by mongo oid
        frag = Pippies.find_one({"_id": oid})
    elif pippi:
        # get by pippi
        frag = Pippies.find_one({"pippi": ' '.join(pippi)})

    if not frag:
        # should a be a set of {'pos':p,'txt':txt,'l':l,'doc':_id}
        self.__dict__ = {'pippi': tuple(pippi),
                         'len': len(pippi),
                         'docs': []}
        self.save()
        return

    self.__dict__ = frag
    self.pippi = tuple(self.pippi.split(" "))
def getRelatedDocIds(self, cutoff=7):
    """Return the ids of the other documents sharing a pippi with this one.

    Only pippies whose ``len`` is at least ``int(cutoff)`` and whose ``docs``
    contain ``self._id`` are considered; this document's own id is left out.

    Returns a set of document ids.
    """
    query = {'len': {'$gte': int(cutoff)}, 'docs': self._id}
    related = set()
    for pippi in Pippies.find(query, ['docs']):
        for doc_id in pippi['docs']:
            if doc_id != self._id:
                related.add(doc_id)
    return related
def search(request):
    """Search pippies by the stems of the words in the ``q`` GET parameter.

    Every token of the query is stemmed with hunspell; a token without a stem
    becomes an empty placeholder (displayed as ``*``). The space-joined stems
    are used as a regular expression on the stored ``pippi`` string. In each
    hit, the first occurrence of the displayed query is wrapped in a
    ``hilite-query`` span.

    GET parameters:
        q: the query; required.
        orderby: ``relevance``, ``docslen`` or ``len``; any other value falls
            back to ``len``.

    Returns the rendered ``search.html``, or ``error.html`` when the query is
    missing or contains no tokens.
    """
    q = cgi.escape(request.GET.get('q', ''))
    words = nltk.tokenize.wordpunct_tokenize(unicode(q)) if q else []
    if not words:
        # A query without tokens (e.g. only whitespace) would give an empty
        # pattern that matches every pippi, and str.partition('') further
        # down would raise ValueError -- treat it like a missing query.
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))

    orderBy = cgi.escape(request.GET.get('orderby', ''))
    # TODO also order by docslen (need to add that to bulksaver)
    if orderBy not in ('relevance', 'docslen', 'len'):
        orderBy = 'len'
    # TODO also handle desc/asc via the tableheader on the web ui
    orderDesc = True

    engine = hunspell.HunSpell(settings.DICT + '.dic', settings.DICT + '.aff')
    filtr = []
    for word in words:
        # stem each word
        stem = engine.stem(word.encode('utf8'))
        filtr.append(stem[0] if stem else '')

    # NOTE(review): the stems are compiled as a regular expression without
    # escaping -- confirm that hunspell can never hand back metacharacters.
    pattern = re.compile(' '.join(filtr))
    template_vars = pager(request, Pippies.find({'pippi': pattern}),
                          orderBy, orderDesc)

    # The query as displayed: unknown words are shown as '*'.
    shown_query = ' '.join([stem if stem else '*' for stem in filtr])
    hits = []
    for pippi in template_vars['data']:
        tokens = pippi['pippi'].split(' ')
        shown = ' '.join([tok if tok else '*' for tok in tokens])
        hits.append({
            'id': pippi['_id'],
            'pippi': '%s<span class="hilite-query">%s</span>%s'
                     % shown.partition(shown_query),
            'docslen': pippi['docslen'],
            'len': len(tokens),
            'relevance': pippi.get('relevance', 0),
        })
    template_vars['pippies'] = hits
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    return render_to_response('search.html', template_vars,
                              context_instance=RequestContext(request))
def __init__(self, pippi, oid=None, frag=None):
    """Bind this object to a stored pippi record, creating one if needed.

    ``oid`` takes precedence and is looked up by ``_id``; otherwise a
    non-empty ``pippi`` is looked up by its space-joined form. When neither
    lookup runs, the ``frag`` argument is used as the record.
    """
    record = frag
    if oid:
        # get by mongo oid
        record = Pippies.find_one({"_id": oid})
    elif pippi:
        # get by pippi
        record = Pippies.find_one({"pippi": ' '.join(pippi)})

    if record:
        # Adopt the stored record; its pippi is a string and is split back
        # into a tuple of tokens.
        self.__dict__ = record
        self.pippi = tuple(self.pippi.split(" "))
    else:
        # should a be a set of {'pos':p,'txt':txt,'l':l,'doc':_id}
        fresh = {'pippi': tuple(pippi), 'len': len(pippi), 'docs': []}
        self.__dict__ = fresh
        self.save()
def getOverview():
    """Return count statistics for the Docs, Pippies and Frags collections.

    Each ``count()`` query runs once and its result is reused for both the
    ``value`` and the ``text`` of the entry. The two can therefore never
    disagree, and the number of queries is halved compared to counting
    separately for each field.

    Returns a list of dicts with ``title``, ``value`` and ``text`` keys.
    """
    stats = []
    # (title, text template, collection) in display order.
    sources = (
        ('Total documents', "%s Documents", Docs),
        ('Total Pippies', "with %s Pippies", Pippies),
        ('Locations', "in %s Locations", Frags),
    )
    for title, template, collection in sources:
        total = collection.count()
        stats.append({'title': title, 'value': total, 'text': template % total})
    return stats
def cutoffSL(doc, cutoff):
    """Count the pippies of ``doc`` per length and return the counts as strings.

    The counting is done by a map/reduce restricted to pippies whose ``docs``
    contain ``doc._id``: the map step emits ``this.len`` with a count of 1 and
    the reduce step sums the counts. Lengths without any pippi are reported
    as ``'0'``.

    Returns the slice ``[4:cutoff]`` of the per-length list, or ``[]`` when
    there are no counts at all.
    """
    m = Code("function(){ emit( this.len , { count : 1 } );}")
    r = Code("function (key, values) { var count = 0; values.forEach(function (v) {count += v.count;}); return {count: count}; }")

    lens = {}
    if Pippies.count() > 0:
        out = Pippies.map_reduce(m, r, 'cutoff sparkline', query={'docs': doc._id})
        for entry in out.find():
            lens[entry['_id']] = int(entry['value']['count'])

    if not lens:
        return []

    # One slot per length from 0 up to and including the largest length seen.
    slots = []
    for n in range(int(max(lens.keys()) + 1)):
        slots.append(str(lens[n]) if n in lens else '0')
    return slots[4:cutoff]
def main():
    """Recompute the derived fields of all pippies and all documents.

    For every pippi with at least one document, ``relevance`` is set to
    ``len / number of docs`` and ``docslen`` to the number of docs. For every
    document, ``tfidf`` is set to ``Doc(d=dd).tfidf``. Progress is written to
    stdout while each collection is processed.
    """
    def _progress(step, total):
        # Whenever the integer percentage changes, print it if it is a
        # multiple of 10 and a dot otherwise. ``//`` keeps the integer
        # division the original ``/`` performed on ints.
        percent = step * 100 // total
        if percent == (step - 1) * 100 // total:
            return
        if percent % 10 == 0:
            sys.stdout.write("%d" % percent)
        else:
            sys.stdout.write('.')
        sys.stdout.flush()

    print("updateing pippies.relevance")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _progress(i, pippieslen)
        if len(pippi['docs']) > 0:
            derived = {
                'relevance': float(pippi['len']) / float(len(pippi['docs'])),
                'docslen': len(pippi['docs']),
            }
            Pippies.update({'_id': pippi['_id']}, {'$set': derived})
    sys.stdout.write('\n')
    sys.stdout.flush()

    print("updateing docs.idf")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _progress(i, docslen)
        Docs.update({'_id': dd['_id']}, {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
def pippies(request):
    """Render the pippies list, filtered and ordered by GET parameters.

    GET parameters:
        cutoff: minimum pippi ``len`` (default ``'7'``).
        doc: ObjectId of a document the pippi must belong to.
        relevance: exact ``relevance`` value to match.
        orderby: sort field (default ``'relevance'``).
        desc: ``'1'`` for descending order (default), anything else ascending.

    A parameter that cannot be parsed is ignored, not reported. When a
    document filter is active and the document exists, its title (or its
    ``docid`` when it has no title) is passed on as ``docTitle``.

    Returns the rendered ``pippies.html``.
    """
    filtr = {}

    # Only parsing errors are swallowed; a bare except would also hide
    # SystemExit and KeyboardInterrupt.
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except (TypeError, ValueError):
        cutoff = None
    if cutoff:
        filtr['len'] = {'$gte': cutoff}

    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # A missing or malformed id simply disables the document filter.
        docfilter = None
    if docfilter:
        filtr['docs'] = docfilter

    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except (TypeError, ValueError):
        relfilter = None
    if relfilter:
        filtr['relevance'] = relfilter

    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))

    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id': pippi['_id'],
        'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen': len(pippi['docs']),
        'relevance': pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter

    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # find_one returns None for an unknown id; skip the title then
        # rather than failing on the membership test.
        if doc is not None:
            template_vars['docTitle'] = doc['title'] if 'title' in doc else doc['docid']

    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def pippies(request):
    """Render a page of pippies selected by the request's GET parameters.

    Recognised parameters: ``cutoff`` (minimum ``len``, default ``'7'``),
    ``doc`` (ObjectId the pippi's ``docs`` must contain), ``relevance``
    (exact value), ``orderby`` (default ``'relevance'``) and ``desc``
    (``'1'`` means descending, the default). Unparsable values are ignored.

    If a document filter is set and that document is found, ``docTitle`` is
    set to its ``title``, falling back to its ``docid``.

    Returns the rendered ``pippies.html``.
    """
    filtr = {}
    docfilter = None
    relfilter = None
    cutoff = None

    # Catch only what the conversions can raise; the former bare ``except``
    # also swallowed SystemExit and KeyboardInterrupt.
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except (TypeError, ValueError):
        pass
    if cutoff:
        filtr['len'] = {'$gte': cutoff}

    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # absent or malformed id: no document filter
        pass
    if docfilter:
        filtr['docs'] = docfilter

    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except (TypeError, ValueError):
        pass
    if relfilter:
        filtr['relevance'] = relfilter

    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = ('1' == cgi.escape(request.GET.get('desc', '1')))

    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    rows = []
    for pippi in template_vars['data']:
        rows.append({
            'id': pippi['_id'],
            'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
            'docslen': len(pippi['docs']),
            'relevance': pippi.get('relevance', 0),
        })
    template_vars['pippies'] = rows
    template_vars['doc'] = docfilter

    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # An id that matches no document yields None; leave docTitle unset
        # in that case (previously this raised TypeError).
        if doc is not None:
            if 'title' in doc:
                template_vars['docTitle'] = doc['title']
            else:
                template_vars['docTitle'] = doc['docid']

    return render_to_response('pippies.html', template_vars,
                              context_instance=RequestContext(request))
def save(self):
    """Save this object to the Pippies collection and record its ``_id``.

    ``self.pippi`` is swapped for its space-joined string form while the
    instance ``__dict__`` is being saved. The original value is restored
    afterwards, also when the save raises, so a failed save no longer leaves
    the instance holding the string form.
    """
    tokens = self.pippi
    self.pippi = " ".join(tokens)
    try:
        self.__dict__['_id'] = Pippies.save(self.__dict__)
    finally:
        self.pippi = tokens
def save(self):
    """Save the instance ``__dict__`` to Pippies and keep the returned id as ``_id``."""
    saved_id = Pippies.save(self.__dict__)
    self.__dict__['_id'] = saved_id
def save(self):
    """Write this object to the Pippies collection and store the returned ``_id``.

    The collection holds ``pippi`` as a space-joined string, so the attribute
    is replaced by that form only for the duration of the save. The
    ``finally`` clause guarantees the original value is put back even if
    ``Pippies.save`` raises; without it a failed save left ``self.pippi`` as
    a string.
    """
    original = self.pippi
    self.pippi = " ".join(original)
    try:
        new_id = Pippies.save(self.__dict__)
        self.__dict__['_id'] = new_id
    finally:
        self.pippi = original
def save(self):
    """Store ``self.__dict__`` in the Pippies collection.

    The value returned by ``Pippies.save`` is written back into the instance
    ``__dict__`` under ``'_id'``.
    """
    state = self.__dict__
    stored_id = Pippies.save(state)
    self.__dict__['_id'] = stored_id