def filterDocs(request):
    """Return a JSON page of documents matching the request's filters.

    GET parameters read here:
      q       -- compiled as a case-insensitive regular expression and
                 matched against 'title'; a pattern that does not compile
                 is matched literally instead.
      starred -- when 'true', restrict '_id' to the ids stored under the
                 session key 'starred'.
      mine    -- when 'true', restrict 'owner' to unicode(request.user).

    The cursor (sorted by '_id' descending) is handed to pager(); the
    entries of its 'data' value are wrapped in Doc and summarised under
    'docs'. The whole result is serialised with jdump().
    """
    query = {}
    q = request.GET.get('q')
    if q:
        try:
            title_re = re.compile(q, re.I)
        except re.error:
            # The pattern comes straight from the query string; an invalid
            # one previously escaped as an unhandled exception.
            title_re = re.compile(re.escape(q), re.I)
        query['title'] = title_re
    if request.GET.get('starred') == 'true':
        query['_id'] = {
            '$in': [ObjectId(x) for x in request.session.get('starred', ())]
        }
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request,
                Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid', False)
    starred = request.session.get('starred', set())
    docs = []
    for doc in (Doc(d=d) for d in res['data']):
        oid = str(doc._id)
        is_starred = oid in starred
        docs.append({
            'id': doc.docid,
            # filled star / hollow star
            'starred': u'\u2605' if is_starred else u'\u2606',
            'starclass': 'starred' if is_starred else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        })
    res['docs'] = docs
    return HttpResponse(jdump(res), mimetype="application/json")
def filterDocs(request):
    """Build a JSON response listing the documents selected by GET filters.

    'q' is used as a case-insensitive regular expression on 'title' (or as
    a literal string when it is not a valid pattern), 'starred=true' limits
    the result to the ids in the session's 'starred' entry and 'mine=true'
    limits it to documents whose 'owner' equals unicode(request.user).

    The query is sorted by '_id' descending and passed through pager();
    each entry of the pager's 'data' is wrapped in Doc and described by a
    dict in res['docs']. The result is serialised with jdump().
    """
    q = request.GET.get('q')
    query = {}
    if q:
        try:
            query['title'] = re.compile(q, re.I)
        except re.error:
            # User input is not guaranteed to be a valid regex; match it
            # literally rather than failing the whole request.
            query['title'] = re.compile(re.escape(q), re.I)
    if request.GET.get('starred') == 'true':
        starred_ids = [ObjectId(x)
                       for x in request.session.get('starred', ())]
        query['_id'] = {'$in': starred_ids}
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    cursor = Docs.find(query, sort=[('_id', pymongo.DESCENDING)])
    res = pager(request, cursor, 'docid', False)
    starred = request.session.get('starred', set())

    def describe(doc):
        # One JSON-ready summary per document; the oid string is computed
        # once and reused for the star lookups.
        oid = str(doc._id)
        marked = oid in starred
        return {
            'id': doc.docid,
            'starred': u'\u2605' if marked else u'\u2606',
            'starclass': 'starred' if marked else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        }

    res['docs'] = [describe(Doc(d=d)) for d in res['data']]
    return HttpResponse(jdump(res), mimetype="application/json")
def pippi(request, refdoc=None):
    """Render 'pippi.html': the paged corpus seen from a reference document.

    refdoc is a docid; when it is missing 'error.html' is rendered instead.
    Every document on the current pager page except the reference document
    itself gets a summary dict in template_vars['docs']; its 'job' flag is
    true when the document's _id is not in refdoc.pippiDocs.
    """
    if not refdoc:
        return render_to_response(
            'error.html',
            {'error': 'specify document: %s!' % refdoc},
            context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']),
                          'docid', False)
    # (docid, _id) pairs of the current page, ordered by docid.
    docs = sorted((doc['docid'], doc['_id'])
                  for doc in template_vars['data'])
    # NOTE: the former `docslen = Docs.count()` was never read and has been
    # dropped together with its extra database query.
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        'job': doc._id not in refdoc.pippiDocs,
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25),
    } for doc in (Doc(docid=d) for d, oid in docs if not oid == refdoc._id)]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html', template_vars,
                              context_instance=RequestContext(request))
def docView(request, doc=None, cutoff=10):
    """Render 'docView.html' for the document with docid *doc*.

    cutoff may be overridden by the 'cutoff' GET parameter. 'error.html' is
    rendered when doc is missing, when the cutoff is zero/empty, or when
    the GET value is not an integer.

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be built.
    """
    def error_page():
        return render_to_response(
            'error.html',
            {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))

    raw_cutoff = request.GET.get('cutoff', 0)
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # A non-numeric cutoff used to surface as an unhandled error;
            # it is exactly the "wrong cutoff" the error page describes.
            return error_page()
    if not doc or not cutoff:
        return error_page()
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Any failure to load the document is reported as "not found".
        raise Http404
    cont = d.body
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
        ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count(),
    }, context_instance=RequestContext(request))
def starred(request):
    """List the documents whose ids are kept in the session's 'starred' entry.

    The ids are converted to ObjectId, looked up in Docs sorted by 'docid'
    ascending, passed through pager() and handed to _listDocs() together
    with a page title.
    """
    starred_ids = [ObjectId(x) for x in request.session.get('starred', ())]
    cursor = Docs.find({'_id': {'$in': starred_ids}},
                       sort=[('docid', pymongo.ASCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
def starred(request):
    """Show the session's starred documents via _listDocs().

    Reads the 'starred' entry of the session (empty when absent), selects
    the matching '_id's from Docs ordered by 'docid' ascending and pages
    the result with pager().
    """
    id_filter = {
        '_id': {
            '$in': [ObjectId(x) for x in request.session.get('starred', ())],
        },
    }
    ordering = [('docid', pymongo.ASCENDING)]
    template_vars = pager(request, Docs.find(id_filter, sort=ordering),
                          'docid', False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
def metaView(request, doc=None):
    """Render 'meta.html' with the metadata of the document *doc*.

    Renders 'error.html' when doc is missing. When Doc(docid=doc) cannot be
    built, 'upload.html' is rendered with an UploadForm pre-filled with the
    requested docid.
    """
    if not doc:
        return render_to_response('error.html',
                                  {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Deliberate best effort: any failure to load the document leads to
        # the upload form. `Exception` (rather than a bare except) keeps
        # SystemExit/KeyboardInterrupt from being swallowed.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html',
                                  {'form': form},
                                  context_instance=RequestContext(request))
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
        ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'related': relDocs,
        'metadata': d.metadata,
    }, context_instance=RequestContext(request))
def metaView(request, doc=None):
    """Render 'meta.html' with the metadata of the document *doc*.

    Renders 'error.html' when doc is missing.

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be built.
    """
    if not doc:
        return render_to_response('error.html',
                                  {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Any failure to load the document is reported as "not found";
        # `Exception` keeps SystemExit/KeyboardInterrupt propagating.
        raise Http404
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
        ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'oid': d._id,
        'related': relDocs,
        'metadata': d.metadata,
    }, context_instance=RequestContext(request))
def starred(request):
    """Render 'corpus.html' with the documents starred in this session.

    The ids under the session key 'starred' are converted to ObjectId and
    looked up in Docs; the cursor is passed through pager() and each entry
    of its 'data' is reloaded as Doc(docid=...) to build the summary dicts
    in template_vars['docs'].
    """
    starred_ids = [ObjectId(x) for x in request.session.get('starred', ())]
    template_vars = pager(
        request,
        Docs.find({'_id': {'$in': starred_ids}}, ['_id', 'docid']),
        'docid', False)
    docids = [doc['docid'] for doc in template_vars['data']]
    # NOTE: the former `docslen = Docs.count()` was never read and has been
    # dropped together with its extra database query.
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25),
    } for doc in (Doc(docid=d) for d in docids)]
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['title'] = 'Your starred documents'
    return render_to_response('corpus.html', template_vars,
                              context_instance=RequestContext(request))
def search(request):
    """Render 'search.html' for the stems of the 'q' GET parameter.

    The escaped query is tokenised and stemmed with the stemmer for its
    guessed language. A token contributes its first stem when that stem is
    not a stop word and is longer than one character, otherwise an empty
    string. DocStems entries whose 'value' contains all of these terms
    select the Docs (via 'stemsid') that are paged and summarised.
    Renders 'error.html' when the query is empty.
    """
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))

    lang = guessLanguage(q)
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)

    # Lazily stem each token, then keep the first stem or '' per token.
    stemmed = (engine.stem(word.encode('utf8'))
               for word in nltk.tokenize.wordpunct_tokenize(unicode(q)))
    filtr = [
        stem[0] if (stem and stem[0] not in swords and len(stem[0]) > 1)
        else ''
        for stem in stemmed
    ]

    stem_hits = DocStems.find({'value': {'$all': filtr}}, ['_id'])
    matches = [hit['_id'] for hit in stem_hits]

    template_vars = pager(request,
                          Docs.find({"stemsid": {'$in': matches}}),
                          'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    summaries = []
    for raw in template_vars['data']:
        doc = Doc(d=raw)
        summaries.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25),
        })
    template_vars['docs'] = summaries
    return render_to_response('search.html', template_vars,
                              context_instance=RequestContext(request))
def metaView(request, doc=None):
    """Show the metadata page ('meta.html') of the document *doc*.

    'error.html' is rendered when no doc is given. The related documents
    are the Docs entries whose '_id' is in d.getRelatedDocIds(cutoff=5).

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be built.
    """
    context = RequestContext(request)
    if not doc:
        return render_to_response('error.html',
                                  {'error': 'Missing document!'},
                                  context_instance=context)
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Report every load failure as 404, but let SystemExit and
        # KeyboardInterrupt through (a bare except swallowed those too).
        raise Http404
    related_ids = list(d.getRelatedDocIds(cutoff=5))
    relDocs = Docs.find({'_id': {'$in': related_ids}}, ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'oid': d._id,
        'related': relDocs,
        'metadata': d.metadata,
    }, context_instance=context)
def docView(request, doc=None, cutoff=10):
    """Render 'docView.html' for the document with docid *doc*.

    cutoff may be overridden by the 'cutoff' GET parameter. 'error.html' is
    rendered when doc is missing, when the cutoff is zero/empty, or when
    the GET value is not an integer. When Doc(docid=doc) cannot be built,
    'upload.html' is rendered with an UploadForm pre-filled with the docid.
    """
    def error_page():
        return render_to_response(
            'error.html',
            {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))

    raw_cutoff = request.GET.get('cutoff', 0)
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # A non-numeric cutoff used to surface as an unhandled error;
            # it is exactly the "wrong cutoff" the error page describes.
            return error_page()
    if not doc or not cutoff:
        return error_page()
    try:
        d = Doc(docid=doc)
    except Exception:
        # Deliberate best effort: any failure to load the document leads to
        # the upload form. `Exception` (rather than a bare except) keeps
        # SystemExit/KeyboardInterrupt from being swallowed.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html',
                                  {'form': form},
                                  context_instance=RequestContext(request))
    cont = d.body
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
        ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count(),
    }, context_instance=RequestContext(request))
def _report_progress(i, total):
    """Write a progress marker to stdout when item *i* reaches a new percent.

    Nothing is written while the whole-number percentage of i/total equals
    that of (i-1)/total. Otherwise the percentage itself is written when it
    is a multiple of ten, and a '.' for any other value.
    """
    percent = i * 100 // total
    if percent == (i - 1) * 100 // total:
        return
    if percent % 10 == 0:
        sys.stdout.write("%d" % percent)
    else:
        sys.stdout.write('.')
    sys.stdout.flush()


def main():
    """Recompute the derived fields of all Pippies and Docs entries.

    For every Pippies entry with at least one doc, '$set's 'relevance' to
    len / number of docs and 'docslen' to the number of docs. Then, for
    every Docs entry, '$set's 'tfidf' to Doc(d=entry).tfidf. Progress is
    written to stdout while each collection is processed.
    """
    sys.stdout.write("updateing pippies.relevance\n")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _report_progress(i, pippieslen)
        doc_count = len(pippi['docs'])
        if doc_count > 0:  # also guards the division below
            Pippies.update(
                {'_id': pippi['_id']},
                {'$set': {
                    'relevance': float(pippi['len']) / float(doc_count),
                    'docslen': doc_count,
                }})
    sys.stdout.write('\n')
    sys.stdout.flush()

    sys.stdout.write("updateing docs.idf\n")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _report_progress(i, docslen)
        Docs.update({'_id': dd['_id']},
                    {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
def search(request):
    """Search the corpus for the stemmed terms of the 'q' GET parameter.

    Renders 'error.html' when the (escaped) query is empty. Otherwise each
    token of the query is stemmed; its first stem is used as a term unless
    it is a stop word or at most one character long, in which case '' is
    used. Docs whose 'stemsid' is among the DocStems entries containing all
    terms are paged and rendered with 'search.html'.
    """
    context = RequestContext(request)
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=context)

    filtr = []
    lang = guessLanguage(q)
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)
    for word in nltk.tokenize.wordpunct_tokenize(unicode(q)):
        stem = engine.stem(word.encode('utf8'))
        term = ''
        if stem and stem[0] not in swords and len(stem[0]) > 1:
            term = stem[0]
        filtr.append(term)

    matches = []
    for entry in DocStems.find({'value': {'$all': filtr}}, ['_id']):
        matches.append(entry['_id'])

    found = Docs.find({"stemsid": {'$in': matches}})
    template_vars = pager(request, found, 'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    def summarise(doc):
        # Template-facing description of one matching document.
        return {
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25),
        }

    template_vars['docs'] = [summarise(Doc(d=d))
                             for d in template_vars['data']]
    return render_to_response('search.html', template_vars,
                              context_instance=context)
def pippi(request, refdoc=None):
    """Render 'pippi.html': the paged corpus relative to document *refdoc*.

    refdoc is a docid; 'error.html' is rendered when it is missing. The
    reference document itself is excluded from the listing. For every other
    document on the current page, 'job' is true when its _id is not in
    refdoc.pippiDocs.
    """
    if not refdoc:
        return render_to_response(
            'error.html',
            {'error': 'specify document: %s!' % refdoc},
            context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']),
                          'docid', False)
    docs = sorted((doc['docid'], doc['_id'])
                  for doc in template_vars['data'])
    # NOTE: the former `docslen = Docs.count()` was never read and has been
    # dropped together with its extra database query.
    listing = []
    for d, oid in docs:
        if oid == refdoc._id:
            continue  # do not list the reference document against itself
        doc = Doc(docid=d)
        listing.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'job': doc._id not in refdoc.pippiDocs,
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25),
        })
    template_vars['docs'] = listing
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html', template_vars,
                              context_instance=RequestContext(request))
def docView(request, doc=None, cutoff=10):
    """Render 'docView.html' with the document body and highlighted fragments.

    cutoff may be overridden by the 'cutoff' GET parameter. 'error.html' is
    rendered when doc is missing, when the cutoff is zero/empty, or when
    the GET value is not an integer. When Doc(docid=doc) cannot be built,
    'upload.html' is rendered with an UploadForm pre-filled with the docid.

    For each distinct fragment text returned by d.getFrags(cutoff=cutoff),
    every occurrence in the body is wrapped in
    '<span class="highlight <pippi>">'. One tooltip per distinct 'pippi'
    value is produced with annotatePippi().
    """
    def error_page():
        return render_to_response(
            'error.html',
            {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))

    raw_cutoff = request.GET.get('cutoff', 0)
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # A non-numeric cutoff used to surface as an unhandled error;
            # it is exactly the "wrong cutoff" the error page describes.
            return error_page()
    if not doc or not cutoff:
        return error_page()
    try:
        d = Doc(docid=doc)
    except Exception:
        # Deliberate best effort: any failure to load the document leads to
        # the upload form. `Exception` (rather than a bare except) keeps
        # SystemExit/KeyboardInterrupt from being swallowed.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html',
                                  {'form': form},
                                  context_instance=RequestContext(request))

    tooltips = {}
    cont = d.body
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
        ['docid', 'title'])
    ls = []       # fragment texts already highlighted
    matches = 0   # number of highlighted occurrences
    # Runs of markup inside a match; compiled once instead of per match.
    tag_run = re.compile(r'((?:\s*<[^>]+>)+)', re.M | re.U)
    for l in d.getFrags(cutoff=cutoff):
        if l['l'] < cutoff:
            break
        # Highlight each distinct fragment text only once.
        if l['txt'] in ls:
            continue
        ls.append(l['txt'])
        t = l['txt']
        # Require a non-word character next to the match on each side whose
        # outermost character is alphanumeric.
        btxt = ''
        etxt = ''
        if t[0][0].isalnum():
            btxt = r'\W'
        if t[-1][-1].isalnum():
            etxt = r'\W'
        # Tokens may be separated by whitespace and/or markup in the body.
        rtxt = (btxt
                + r'\s*(?:<[^>]*>\s*)*'.join([re.escape(x) for x in t])
                + etxt)
        regex = re.compile(rtxt, re.I | re.M | re.U)
        # finditer() scans the string bound to cont right now, so positions
        # must be shifted by the markup inserted for earlier matches of
        # this fragment.
        offset = 0
        if l['pippi'] not in tooltips:
            tooltips[l['pippi']] = annotatePippi(d, l, cutoff)
        for r in regex.finditer(cont):
            span = ('<span class="highlight %s">' % l['pippi'], '</span>')
            start = r.start() + offset
            if btxt:
                start += 1  # exclude the leading \W character
            end = r.end() + offset
            if etxt:
                end -= 1    # exclude the trailing \W character
            # Close and reopen the span around markup inside the match.
            match, n = tag_run.subn(r'%s\1%s' % (span[1], span[0]),
                                    cont[start:end])
            cont = cont[:start] + span[0] + match + span[1] + cont[end:]
            offset += (n + 1) * (len(span[0]) + len(span[1]))
            matches += 1
    cont = anchorArticles(cont)
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': len(ls),
        'tooltips': '\n'.join(tooltips.values()),
        'matches': matches,
    }, context_instance=RequestContext(request))
def listDocs(request):
    """List the whole Docs collection, sorted by 'docid' descending.

    The cursor is passed through pager() and the result, with a page title
    added, is handed to _listDocs().
    """
    cursor = Docs.find(sort=[('docid', pymongo.DESCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, template_vars)
def listDocs(request):
    """Show every document in Docs through _listDocs().

    Documents are requested in descending 'docid' order and paged with
    pager() before the title is attached.
    """
    ordering = [('docid', pymongo.DESCENDING)]
    template_vars = pager(request, Docs.find(sort=ordering), 'docid', False)
    template_vars['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, template_vars)