def pippi(request, refdoc=None):
    """Render the comparison overview for a reference document.

    Builds one row of statistics for every document in the data returned by
    `pager`, leaving out the reference document itself.

    Args:
        request: the incoming Django request.
        refdoc: docid of the reference document.

    Returns:
        The rendered 'pippi.html' response, or 'error.html' when no
        reference document was given.
    """
    if not refdoc:
        return render_to_response(
            'error.html',
            {'error': 'specify document: %s!' % refdoc},
            context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request,
                          Docs.find({}, ['_id', 'docid']),
                          'docid',
                          False)
    # Order the listed documents by docid (ties broken by _id).
    docs = sorted((doc['docid'], doc['_id']) for doc in template_vars['data'])
    # The reference document is never listed against itself.
    others = (Doc(docid=d) for d, oid in docs if oid != refdoc._id)
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        # True when this document is not yet in the reference's pippiDocs.
        'job': doc._id not in refdoc.pippiDocs,
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25)
    } for doc in others]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html',
                              template_vars,
                              context_instance=RequestContext(request))
def docView(request, doc=None, cutoff=10):
    """Render a single document together with its related documents.

    Args:
        request: the incoming Django request; an optional `cutoff` GET
            parameter overrides the `cutoff` argument.
        doc: docid of the document to show.
        cutoff: value passed on to the document's related-docs, fragment
            and `cutoffSL` lookups.

    Returns:
        The rendered 'docView.html' response, or 'error.html' when the
        document is missing or the cutoff is zero or not a number.

    Raises:
        Http404: when the document cannot be loaded.
    """
    raw_cutoff = request.GET.get('cutoff', 0)
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except (TypeError, ValueError):
            # A non-numeric cutoff is reported through the error page below
            # instead of surfacing as an unhandled exception.
            cutoff = None
    if not doc or not cutoff:
        return render_to_response(
            'error.html',
            {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        raise Http404
    cont = d.body
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=cutoff))}},
        ['docid', 'title'])
    return render_to_response(
        'docView.html',
        {
            'doc': d,
            'oid': d._id,
            'user': request.user,
            'content': cont,
            'related': relDocs,
            'cutoff': cutoff,
            'cutoffs': ','.join(cutoffSL(d, cutoff)),
            'len': d.getFrags(cutoff=cutoff).count()
        },
        context_instance=RequestContext(request))
def filterDocs(request):
    """Return a JSON listing of documents matching the request's filters.

    Recognised GET parameters:
        q: case-insensitive regular expression matched against the title;
            a pattern that does not compile is matched literally instead.
        starred: when 'true', restrict to the ids stored in the session's
            'starred' entry.
        mine: when 'true', restrict to documents owned by the current user.

    Returns:
        An HttpResponse carrying the `jdump`-serialised pager result with an
        added 'docs' list.
    """
    q = request.GET.get('q')
    query = {}
    if q:
        try:
            title_pattern = re.compile(q, re.I)
        except re.error:
            # The pattern comes straight from the user; fall back to a
            # literal match rather than failing the whole request.
            title_pattern = re.compile(re.escape(q), re.I)
        query = {'title': title_pattern}
    if request.GET.get('starred') == 'true':
        query['_id'] = {
            '$in': [ObjectId(x) for x in request.session.get('starred', ())]
        }
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request,
                Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid',
                False)
    starred = request.session.get('starred', set())
    rows = []
    for doc in (Doc(d=d) for d in res['data']):
        oid = str(doc._id)
        is_starred = oid in starred
        rows.append({
            'id': doc.docid,
            'starred': u'\u2605' if is_starred else u'\u2606',
            'starclass': 'starred' if is_starred else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        })
    res['docs'] = rows
    return HttpResponse(jdump(res), mimetype="application/json")
def job(request):
    """Run `lcs.pippi` on the two documents named by GET `d1` and `d2`.

    Returns:
        A redirect to '/doc/<d1>' after the comparison, or 'error.html' when
        either document cannot be loaded.
    """
    d1 = request.GET.get('d1', '')
    d2 = request.GET.get('d2', '')
    # Only ordinary errors are treated as "document not loadable";
    # SystemExit and KeyboardInterrupt are allowed to propagate.
    try:
        D1 = Doc(docid=d1, owner=request.user)
    except Exception:
        return render_to_response(
            'error.html',
            {'error': 'wrong document: "%s"!' % d1},
            context_instance=RequestContext(request))
    try:
        D2 = Doc(docid=d2, owner=request.user)
    except Exception:
        return render_to_response(
            'error.html',
            {'error': 'specify document: "%s"!' % d2},
            context_instance=RequestContext(request))
    lcs.pippi(D1, D2)
    return HttpResponseRedirect('/doc/%s' % (d1))
def jobs(request):
    """Run `lcs.pippi` for the reference document against several others.

    GET parameters:
        doc: object id of the reference document.
        ids: object ids of the documents to compare it with (repeatable);
            the reference document's own id is skipped.

    Returns:
        A redirect to the reference document's page, or 'error.html' when
        the reference document cannot be loaded.
    """
    import logging

    rdoc = request.GET.get('doc')
    try:
        refdoc = Doc(oid=ObjectId(rdoc))
    except Exception:
        return render_to_response(
            'error.html',
            {'error': 'wrong document: "%s"!' % rdoc},
            context_instance=RequestContext(request))
    failed = []
    for doc in request.GET.getlist('ids'):
        if doc == rdoc:
            continue
        try:
            od = Doc(oid=ObjectId(doc))
        except Exception:
            # One unloadable document must not abort the remaining jobs.
            failed.append(doc)
            continue
        lcs.pippi(refdoc, od)
    if failed:
        # The redirect cannot carry the failures, so at least record them.
        logging.getLogger(__name__).warning(
            'jobs: could not load documents: %s', ', '.join(failed))
    return HttpResponseRedirect('/doc/%s' % (refdoc.docid))
def setTitle(request, docid):
    """Update a document's title from POST `value` and return the title.

    The title is only changed when the requesting user is authenticated and
    is the document's owner, and when the request actually carries a
    `value` field.

    Returns:
        An HttpResponse containing the document's (possibly updated) title,
        or an empty response when the document cannot be loaded.
    """
    try:
        d = Doc(docid=docid)
    except Exception:
        return HttpResponse('')
    is_owner = (request.user.is_authenticated()
                and request.user.username == d.owner)
    new_title = request.POST.get('value')
    # A request without 'value' must not replace the stored title with None.
    if is_owner and new_title is not None:
        d.title = new_title
        d.save()
    return HttpResponse(d.title)
def delete(request, docid):
    """Delete a document that has no pippi references, if the user owns it.

    Returns:
        'error.html' when the document cannot be loaded or when it has
        pippies / pippiDocs; otherwise a redirect to '/browse'. The document
        is only deleted when the authenticated user is its owner.
    """
    try:
        d = Doc(docid=docid)
    except Exception:
        return render_to_response(
            'error.html',
            {'error': 'no such document: "%s"!' % docid},
            context_instance=RequestContext(request))
    if len(d.pippies) > 0 or len(d.pippiDocs) > 0:
        # TODO support cascade deletion in doc.py
        return render_to_response(
            'error.html',
            {
                'error': 'This document has been pippied, and cannot be removed without causing missing references in the documents it has been pippied against!'
            },
            context_instance=RequestContext(request))
    if request.user.is_authenticated() and request.user.username == d.owner:
        d.delete()
    return HttpResponseRedirect('/browse')
def _listDocs(request, template_vars, tpl='corpus.html'):
    """Fill `template_vars` with per-document rows and render `tpl`.

    One row is built for every entry of `template_vars['data']`; the
    overview statistics and the session's starred set are added as well.
    """
    rows = []
    for raw in template_vars['data']:
        doc = Doc(d=raw)
        rows.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = rows
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    context = RequestContext(request)
    return render_to_response(tpl, template_vars, context_instance=context)
def search(request):
    """Search documents by the stems of the words in GET `q`.

    The query is tokenized and stemmed in its guessed language; documents
    are selected through the DocStems entries whose 'value' contains all of
    the resulting stems.

    Returns:
        The rendered 'search.html' response, or 'error.html' when the query
        is empty.
    """
    raw_q = request.GET.get('q', '')
    # The escaped form is only what gets handed to the template. Stemming
    # works on the raw text: escaping first would turn '&' into '&amp;' and
    # add a bogus 'amp' stem to the query.
    q = cgi.escape(raw_q)
    if not q:
        return render_to_response(
            'error.html',
            {'error': 'Missing search query!'},
            context_instance=RequestContext(request))
    filtr = []
    lang = guessLanguage(raw_q)
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)
    for word in nltk.tokenize.wordpunct_tokenize(unicode(raw_q)):
        # stem each word
        stem = engine.stem(word.encode('utf8'))
        if stem and stem[0] not in swords and len(stem[0]) > 1:
            filtr.append(stem[0])
        else:
            # Stop words, one-character stems and unstemmable tokens are
            # represented by an empty string.
            filtr.append('')
    matches = [
        x['_id']
        for x in DocStems.find({'value': {'$all': filtr}}, ['_id'])
    ]
    template_vars = pager(request,
                          Docs.find({"stemsid": {'$in': matches}}),
                          'docid',
                          False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25)
    } for doc in (Doc(d=d) for d in template_vars['data'])]
    return render_to_response('search.html',
                              template_vars,
                              context_instance=RequestContext(request))
def metaView(request, doc=None):
    """Render the metadata page of a document.

    Args:
        request: the incoming Django request.
        doc: docid of the document to show.

    Returns:
        The rendered 'meta.html' response, or 'error.html' when no document
        was given.

    Raises:
        Http404: when the document cannot be loaded.
    """
    if not doc:
        return render_to_response(
            'error.html',
            {'error': 'Missing document!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        raise Http404
    relDocs = Docs.find(
        {'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}},
        ['docid', 'title'])
    return render_to_response(
        'meta.html',
        {
            'doc': d,
            'oid': d._id,
            'related': relDocs,
            'metadata': d.metadata,
        },
        context_instance=RequestContext(request))
def _show_progress(i, total):
    """Write a progress mark when item `i` of `total` reaches a new percent.

    Every new whole percent prints a '.', except multiples of ten, which
    print the percentage itself.
    """
    percent = i * 100 // total
    if percent == (i - 1) * 100 // total:
        return
    if percent % 10 == 0:
        sys.stdout.write("%d" % percent)
    else:
        sys.stdout.write('.')
    sys.stdout.flush()


def main():
    """Recompute `relevance`/`docslen` on all pippies and `tfidf` on all docs.

    Progress is written to stdout while each collection is processed.
    """
    print("updateing pippies.relevance")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _show_progress(i, pippieslen)
        docs_in_pippi = len(pippi['docs'])
        # Entries without documents are skipped; this also avoids dividing
        # by zero.
        if docs_in_pippi > 0:
            Pippies.update({'_id': pippi['_id']}, {
                '$set': {
                    'relevance': float(pippi['len']) / float(docs_in_pippi),
                    'docslen': docs_in_pippi,
                },
            })
    sys.stdout.write('\n')
    sys.stdout.flush()

    print("updateing docs.idf")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _show_progress(i, docslen)
        Docs.update({'_id': dd['_id']},
                    {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
def frags(request):
    """List fragments, optionally filtered by document, pippi or length.

    GET parameters:
        doc: object id; restricts fragments to that document.
        pippi: object id; restricts fragments to that pippi.
        cutoff: minimum value of the fragment's 'l' field (default 7); only
            applied when no pippi filter is active.

    Parameters that cannot be parsed are ignored.

    Returns:
        The rendered 'frags.html' response.
    """
    filtr = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # Missing or malformed id: simply do not filter by document.
        pass
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        # Missing or malformed id: simply do not filter by pippi.
        pass
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except ValueError:
            pass
        if cutoff:
            filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            # With a pippi filter each fragment's text is passed through
            # diffFrag together with the previously produced text.
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
            'pippi': p,
            'doc': d,
        })
    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        found = Docs.find_one({'_id': docfilter}, ['docid'])
        # A well-formed id that matches no document leaves the title unset
        # instead of failing on a None result.
        if found is not None:
            template_vars['docTitle'] = found['docid']
    if pippifilter:
        template_vars['pippiFilter'] = 1
    return render_to_response('frags.html',
                              template_vars,
                              context_instance=RequestContext(request))
if not stem in frags: frags[stem] = [l, [], []] # append position to list appointed by sel sel = ((end - l) / ld) > 0 frags[stem][1 + sel].append(end - l - (sel * ld)) if saver: saver.addDocs(D1, D2) # also save as annotations saveNotes(D1, D2, frags, settings.ROOT_URL) return [(saver.save( D1, D2, bulksaver.lcsPkt(sorted(a), sorted(b), l, stem, D1, D2)) if saver else (l, stem, sorted(a), sorted(b))) for stem, (l, a, b) in frags.items() if a and b] if __name__ == "__main__": #from lenx.view.eurlex import Doc from lenx.view.doc import Doc #import pprint import sys #frag=LCS(doc1,doc2) #pprint.pprint(frag.root) d1 = Doc(docid=sys.argv[1].strip('\t\n')) #pprint.pprint(d1.extractMetadata()) d2 = Doc(docid=sys.argv[2].strip('\t\n')) #pprint.pprint(d2.extractMetadata()) pips = pippi(d1, d2, None) #pprint.pprint(pips)
def getDocs(self, d, cutoff=7):
    """Return a set of Doc objects for every id in self.docs except d's.

    `cutoff` is accepted but not used here.
    """
    others = set()
    for oid in self.docs:
        if oid != d._id:
            others.add(Doc(oid=oid))
    return others
def _getScore(self):
    """Sum the tf-idf values, in this object's document, of its pippi's terms.

    Terms missing from the document's tfidf mapping contribute 0.
    """
    document = Doc(oid=self.doc)
    pattern = Pippi('', oid=self.pippi)
    score = 0
    for term in pattern.pippi:
        score += document.tfidf.get(term, 0)
    return score
float(1 + self.num_docs) / (1 + self.term_num_docs[term])) def get_doc_keywords(self, doc): """Retrieve terms and corresponding tf-idf for the specified document. The returned terms are ordered by decreasing tf-idf. """ tfidf = {} doclen = len(doc.stems) for word in doc.termcnt: # The definition of TF specifies the denominator as the count of terms # within the document, but for short documents, I've found heuristically # that sometimes len(tokens_set) yields more intuitive results. mytf = float(doc.termcnt[word]) / doclen myidf = self.get_idf(word) tfidf[word] = mytf * myidf return tfidf def save(self): self.__dict__['_id'] = MiscDb.save(self.__dict__) tfidf = TfIdf() if __name__ == "__main__": d = Doc('acta-release') print d.stems #d.save() print 'asdf' d1 = Doc('acta-release') print d1.stems