Beispiel #1
0
def docView(request, doc=None, cutoff=10):
    """Render ``docView.html`` for one document and its related documents.

    The cutoff is taken from the ``cutoff`` query parameter when one is
    supplied, otherwise from the ``cutoff`` argument.  A missing document
    id, a zero cutoff or a non-numeric cutoff renders ``error.html``.

    Raises Http404 when ``Doc(docid=doc, owner=request.user)`` fails.
    """
    requested = request.GET.get('cutoff', 0)
    if requested:
        try:
            cutoff = int(requested)
        except (TypeError, ValueError):
            # A malformed query parameter is a client error; fall through
            # to the "wrong cutoff" page instead of crashing with a 500.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response(
            'error.html', {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Not a bare ``except:`` so that SystemExit and KeyboardInterrupt
        # are not converted into a 404.
        raise Http404
    cont = d.body
    related_ids = list(d.getRelatedDocIds(cutoff=cutoff))
    relDocs = Docs.find({'_id': {'$in': related_ids}}, ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count()
    },
                              context_instance=RequestContext(request))
Beispiel #2
0
def pippi(request, refdoc=None):
    """Render ``pippi.html``: the current page of documents, each described
    relative to the reference document ``refdoc``.

    Renders ``error.html`` when ``refdoc`` is empty or when the reference
    document cannot be loaded.  The reference document itself is left out
    of the listing.
    """
    if not refdoc:
        return render_to_response('error.html',
                                  {'error': 'specify document: %s!' % refdoc},
                                  context_instance=RequestContext(request))
    try:
        ref = Doc(docid=refdoc)
    except Exception:
        # Previously unguarded: a failing lookup escaped as an unhandled
        # exception instead of an error page.
        return render_to_response('error.html',
                                  {'error': 'no such document: "%s"!' % refdoc},
                                  context_instance=RequestContext(request))
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']), 'docid',
                          False)
    page = sorted((row['docid'], row['_id'])
                  for row in template_vars['data'])
    # Lazily load each listed document, skipping the reference document.
    others = (Doc(docid=docid) for docid, oid in page if oid != ref._id)
    template_vars['docs'] = [{
        'id': doc.docid,
        'oid': str(doc._id),
        'indexed': doc.pippiDocsLen,
        'title': doc.title,
        'frags': doc.getFrags().count(),
        'pippies': len(doc.pippies),
        # True when this document's id is not yet in the reference
        # document's pippiDocs.
        'job': doc._id not in ref.pippiDocs,
        'type': doc.type,
        'docs': len(doc.getRelatedDocIds()),
        'tags': doc.autoTags(25)
    } for doc in others]
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = ref.docid
    template_vars['reftitle'] = ref.title
    template_vars['oid'] = str(ref._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #3
0
def setTitle(request, docid):
    """Set a document's title from the POSTed ``value`` field.

    The title is changed and saved only when the requester is
    authenticated, their username equals the document's ``owner``, and a
    ``value`` was actually posted.  The response body is the document's
    (possibly updated) title, or an empty string when the document cannot
    be loaded.
    """
    try:
        d = Doc(docid=docid)
    except Exception:
        return HttpResponse('')
    new_title = request.POST.get('value')
    user = request.user
    # A request without 'value' used to overwrite the title with None.
    if (new_title is not None and user.is_authenticated()
            and user.username == d.owner):
        d.title = new_title
        d.save()
    return HttpResponse(d.title)
Beispiel #4
0
def setTitle(request, docid):
    """Update a document's title with the POSTed ``value`` and echo it back.

    Only an authenticated user whose username matches the document's
    ``owner`` may change the title, and only when ``value`` is present in
    the POST data.  Returns the current title in every other case, or an
    empty body when the document cannot be loaded.
    """
    try:
        d = Doc(docid=docid)
    except Exception:
        return HttpResponse('')
    is_owner = (request.user.is_authenticated()
                and request.user.username == d.owner)
    value = request.POST.get('value')
    # Skip the write when no value was posted, so the stored title is
    # never replaced by None.
    if is_owner and value is not None:
        d.title = value
        d.save()
    return HttpResponse(d.title)
Beispiel #5
0
def metaView(request,doc=None):
    """Render ``meta.html`` with a document's metadata and related documents.

    Without a document id the error page is rendered; when the document
    cannot be loaded the upload form is shown, pre-filled with the id.
    """
    if doc:
        try:
            document = Doc(docid=doc)
        except:
            upload = UploadForm({'docid': doc})
            return render_to_response('upload.html', {'form': upload}, context_instance=RequestContext(request))

        related_ids = list(document.getRelatedDocIds(cutoff=5))
        context = {
            'doc': document,
            'related': Docs.find({'_id': {'$in': related_ids}}, ['docid', 'title']),
            'metadata': document.metadata,
        }
        return render_to_response('meta.html', context, context_instance=RequestContext(request))

    return render_to_response('error.html', {'error': 'Missing document!'}, context_instance=RequestContext(request))
Beispiel #6
0
def metaView(request,doc=None):
    """Render ``meta.html`` for a document loaded on behalf of the requester.

    Renders the error page when no document id is given and raises Http404
    when the document cannot be loaded.
    """
    if not doc:
        problem = {'error': 'Missing document!'}
        return render_to_response('error.html', problem, context_instance=RequestContext(request))
    try:
        document = Doc(docid=doc, owner=request.user)
    except:
        raise Http404

    neighbour_ids = list(document.getRelatedDocIds(cutoff=5))
    neighbours = Docs.find({'_id': {'$in': neighbour_ids}}, ['docid', 'title'])
    page = dict(doc=document,
                oid=document._id,
                related=neighbours,
                metadata=document.metadata)
    return render_to_response('meta.html', page, context_instance=RequestContext(request))
Beispiel #7
0
def delete(request, docid):
    """Delete a document when its owner asks for it, then go to ``/browse``.

    Renders the error page when the document cannot be loaded or when it
    has pippies or pippiDocs.  A requester who is not the authenticated
    owner is redirected without anything being deleted.
    """
    def fail(message):
        # Shared rendering of the error page for both refusal cases.
        return render_to_response('error.html',
                                  {'error': message},
                                  context_instance=RequestContext(request))

    try:
        d = Doc(docid=docid)
    except:
        return fail('no such document: "%s"!' % docid)

    is_referenced = len(d.pippies) > 0 or len(d.pippiDocs) > 0
    if is_referenced:
        # TODO support cascade deletion in doc.py
        return fail('This document has been pippied, and cannot be removed without causing missing references in the documents it has been pippied against!')

    user = request.user
    if user.is_authenticated() and user.username == d.owner:
        d.delete()
    return HttpResponseRedirect('/browse')
Beispiel #8
0
def filterDocs(request):
    """Return one page of documents as JSON, filtered by query parameters.

    Query parameters read from ``request.GET``:
      q       -- case-insensitive regular expression matched against the
                 title; a pattern that does not compile is matched as
                 literal text instead of raising.
      starred -- ``'true'`` restricts the result to ids stored in the
                 session's ``starred`` collection.
      mine    -- ``'true'`` restricts the result to documents whose owner
                 is the requesting user.
    """
    q = request.GET.get('q')
    query = {}
    if q:
        try:
            title_pattern = re.compile(q, re.I)
        except re.error:
            # User input such as "(" is not a valid regex; fall back to a
            # literal match rather than failing the whole request.
            title_pattern = re.compile(re.escape(q), re.I)
        query['title'] = title_pattern
    # Read the session once; it is needed both for the filter and for the
    # per-document star markers.
    starred = request.session.get('starred', set())
    if request.GET.get('starred') == 'true':
        query['_id'] = {'$in': [ObjectId(x) for x in starred]}
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request, Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid', False)

    def describe(doc):
        # Build the JSON row for one document.
        oid = str(doc._id)
        is_starred = oid in starred
        return {
            'id': doc.docid,
            'starred': u'\u2605' if is_starred else u'\u2606',
            'starclass': 'starred' if is_starred else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        }

    res['docs'] = [describe(Doc(d=d)) for d in res['data']]
    return HttpResponse(jdump(res), mimetype="application/json")
Beispiel #9
0
def delete(request, docid):
    """Remove a document on behalf of its owner and redirect to ``/browse``.

    The error page is rendered when the document cannot be loaded, or when
    it has pippies or pippiDocs.  Requests from anyone other than the
    authenticated owner fall through to the redirect without deleting.
    """
    try:
        target = Doc(docid=docid)
    except:
        message = 'no such document: "%s"!' % docid
        return render_to_response('error.html', {'error': message},
                                  context_instance=RequestContext(request))

    referenced = len(target.pippies) > 0 or len(target.pippiDocs) > 0
    if referenced:
        # TODO support cascade deletion in doc.py
        message = 'This document has been pippied, and cannot be removed without causing missing references in the documents it has been pippied against!'
        return render_to_response('error.html', {'error': message},
                                  context_instance=RequestContext(request))

    requester = request.user
    if requester.is_authenticated() and requester.username == target.owner:
        target.delete()
    return HttpResponseRedirect('/browse')
Beispiel #10
0
def job(request):
    """Run ``lcs.pippi`` on the two documents named by ``d1`` and ``d2``.

    Both ids come from the query string and are loaded for the requesting
    user.  If either fails to load, the error page is rendered with that
    document's message.  On success the client is redirected to
    ``/doc/<d1>``.
    """
    d1 = request.GET.get('d1', '')
    d2 = request.GET.get('d2', '')
    attempts = ((d1, 'wrong document: "%s"!'),
                (d2, 'specify document: "%s"!'))
    loaded = []
    for docid, complaint in attempts:
        try:
            loaded.append(Doc(docid=docid, owner=request.user))
        except:
            return render_to_response('error.html',
                                      {'error': complaint % docid},
                                      context_instance=RequestContext(request))
    lcs.pippi(*loaded)
    return HttpResponseRedirect('/doc/%s' % (d1))
Beispiel #11
0
def jobs(request):
    """Run ``lcs.pippi`` for a reference document against a list of others.

    ``doc`` (query string) is the object id of the reference document and
    ``ids`` the object ids to compare it with; the reference id itself is
    skipped.  Ids that fail to load are collected in ``failed``, which is
    not reported anywhere in this function.  Redirects to the reference
    document's page afterwards.
    """
    rdoc = request.GET.get('doc')
    try:
        refdoc = Doc(oid=ObjectId(rdoc))
    except:
        return render_to_response('error.html',
                                  {'error': 'wrong document: "%s"!' % rdoc},
                                  context_instance=RequestContext(request))
    failed = []
    candidates = (oid for oid in request.GET.getlist('ids') if oid != rdoc)
    for candidate in candidates:
        try:
            other = Doc(oid=ObjectId(candidate))
        except:
            failed.append(candidate)
        else:
            lcs.pippi(refdoc, other)
    return HttpResponseRedirect('/doc/%s' % (refdoc.docid))
Beispiel #12
0
def createDoc(request):
    """Create a document from the upload form and redirect to its page.

    An invalid form is re-rendered.  Otherwise the submitted markup is run
    through tidy with the options below and stored as a ``Doc``.  When the
    document has no stems yet, its terms are added to the tf-idf corpus
    and the document is saved.
    """
    form = UploadForm(request.POST)
    if not form.is_valid():
        return render_to_response('upload.html', {'form': form}, context_instance=RequestContext(request))

    doc = form.cleaned_data['doc']
    docid = form.cleaned_data['docid']
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'doctype': "strict",
        'wrap': 0,
    }
    cleaned = tidy.parseString(doc, **tidy_options)
    raw = unicode(str(cleaned), 'utf8')
    d = Doc(raw=raw.encode('utf8'), docid=docid.encode('utf8'))
    needs_indexing = 'stems' not in d.__dict__ or not d.stems
    if needs_indexing:
        # let's calculate and cache the results
        tfidf.add_input_document(d.termcnt.keys())
        d.save()
    return HttpResponseRedirect('/doc/%s' % (d.docid))
Beispiel #13
0
def metaView(request, doc=None):
    """Show the metadata page for a document owned by the requesting user.

    Missing id: error page.  Document that fails to load: Http404.
    Otherwise ``meta.html`` is rendered with the document, its object id,
    its related documents (cutoff 5) and its metadata.
    """
    if doc:
        try:
            d = Doc(docid=doc, owner=request.user)
        except:
            raise Http404

        wanted = {'_id': {'$in': list(d.getRelatedDocIds(cutoff=5))}}
        context = {'doc': d}
        context['oid'] = d._id
        context['related'] = Docs.find(wanted, ['docid', 'title'])
        context['metadata'] = d.metadata
        return render_to_response('meta.html',
                                  context,
                                  context_instance=RequestContext(request))

    return render_to_response('error.html', {'error': 'Missing document!'},
                              context_instance=RequestContext(request))
Beispiel #14
0
def docView(request, doc=None, cutoff=10):
    """Render ``docView.html`` for one document and its related documents.

    The cutoff is taken from the ``cutoff`` query parameter when one is
    supplied, otherwise from the ``cutoff`` argument.  A missing document
    id, a zero cutoff or a non-numeric cutoff renders ``error.html``.
    When the document cannot be loaded, the upload form is shown
    pre-filled with the requested id.
    """
    requested = request.GET.get('cutoff', 0)
    if requested:
        try:
            cutoff = int(requested)
        except (TypeError, ValueError):
            # Malformed query parameter: show the "wrong cutoff" page
            # below rather than failing with an unhandled ValueError.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Not a bare ``except:`` so that SystemExit and KeyboardInterrupt
        # are not turned into an upload form.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', {'form': form}, context_instance=RequestContext(request))
    cont = d.body
    related_ids = list(d.getRelatedDocIds(cutoff=cutoff))
    relDocs = Docs.find({'_id': {'$in': related_ids}}, ['docid', 'title'])
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d, cutoff)),
                                               'len': d.getFrags(cutoff=cutoff).count()}, context_instance=RequestContext(request))
Beispiel #15
0
def _listDocs(request, template_vars, tpl='corpus.html'):
    """Fill ``template_vars`` with document summaries and render ``tpl``.

    One summary dict is built per entry of ``template_vars['data']``.  The
    overview statistics and the session's starred set are added as well.
    """
    summaries = []
    for record in template_vars['data']:
        doc = Doc(d=record)
        summaries.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = summaries
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response(tpl,
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #16
0
def search(request):
    """Search documents by the stems of the query words; render ``search.html``.

    The escaped query ``q`` is tokenised and each token stemmed with the
    stemmer for the guessed language.  A token contributes its first stem
    when that stem is longer than one character and not in the stop-word
    collection, and an empty string otherwise.  Documents are matched
    through ``DocStems`` entries whose ``value`` contains all of the
    resulting terms.
    """
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))

    lang = guessLanguage(q)
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)

    def term_for(word):
        # stem each word; unusable stems become '' placeholders
        stem = engine.stem(word.encode('utf8'))
        if stem and stem[0] not in swords and len(stem[0]) > 1:
            return stem[0]
        return ''

    filtr = [term_for(word)
             for word in nltk.tokenize.wordpunct_tokenize(unicode(q))]

    matches = []
    for hit in DocStems.find({'value': {'$all': filtr}}, ['_id']):
        matches.append(hit['_id'])

    found = Docs.find({"stemsid": {'$in': matches}})
    template_vars = pager(request, found, 'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    summaries = []
    for record in template_vars['data']:
        doc = Doc(d=record)
        summaries.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = summaries
    return render_to_response('search.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #17
0
def main():
    print "updateing pippies.relevance"
    pippies=Pippies.find({},['docs','len'])
    pippieslen=pippies.count()
    i=1
    for pippi in pippies:
        if (i*100/pippieslen)!=((i-1)*100/pippieslen):
            if (i*100/pippieslen) % 10 == 0:
                sys.stdout.write("%d" % (i*100/pippieslen))
                sys.stdout.flush()
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if len(pippi['docs'])>0:
                Pippies.update({'_id' : pippi['_id']},
                               { '$set': { 'relevance': float(pippi['len'])/float(len(pippi['docs'])),
                                           'docslen': len(pippi['docs']),}, })
        i=i+1
    sys.stdout.write('\n')
    sys.stdout.flush()

    print "updateing docs.idf"
    docs=Docs.find({},['termcnt','docid','stemsid','rawid'])
    docslen=docs.count()
    i=1
    for dd in docs:
        if (i*100/docslen)!=((i-1)*100/docslen):
            if (i*100/docslen) % 10 == 0:
                sys.stdout.write("%d" % (i*100/docslen))
                sys.stdout.flush()
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        Docs.update({'_id': dd['_id']},{ '$set': { 'tfidf': Doc(d=dd).tfidf } })
        i=i+1
    sys.stdout.write('\n')
    sys.stdout.flush()
Beispiel #18
0
        if StopFrags.isStopFrag(stem): continue
        if not stem in frags:
            frags[stem]=[l, [],[]]
        # append position to list appointed by sel
        sel=((end-l)/ld)>0
        frags[stem][1+sel].append(end-l-(sel*ld))

    if saver: saver.addDocs(D1,D2)
    return [(saver.save(D1,D2,bulksaver.lcsPkt(sorted(a),sorted(b),l,stem,D1,D2))
             if saver
             else (l,stem,sorted(a),sorted(b)))
            for stem, (l, a, b)
            in frags.items()
            if a and b]

if __name__ == "__main__":
    # Command-line check: load the two documents whose ids are given as the
    # first and second argument and pretty-print the metadata extracted
    # from each, in that order.
    from lenx.view.doc import Doc
    import pprint
    import sys

    for position in (1, 2):
        docid = sys.argv[position].strip('\t\n')
        pprint.pprint(Doc(docid=docid).extractMetadata())
Beispiel #19
0
def docView(request,doc=None,cutoff=10):
    """Render ``docView.html`` with the document body, in which every
    occurrence of the document's fragments is wrapped in highlight spans.

    The cutoff comes from the ``cutoff`` query parameter when supplied,
    otherwise from the ``cutoff`` argument.  A missing document id or a zero
    cutoff renders ``error.html``; a document that cannot be loaded renders
    the upload form pre-filled with the requested id.

    NOTE(review): a non-numeric ``cutoff`` query parameter makes ``int()``
    raise ValueError here, which is not handled.
    """
    if request.GET.get('cutoff', 0):
        cutoff = int(request.GET['cutoff'])
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except:
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))
    # one annotation per pippi id, built the first time that pippi is seen
    tooltips={}
    cont = d.body
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=cutoff))} }, ['docid','title'])
    # fragment texts already processed, and total number of highlighted matches
    ls = []
    matches = 0
    for l in d.getFrags(cutoff=cutoff):
        # stops at the first fragment shorter than the cutoff
        # (presumably the fragments arrive longest first -- TODO confirm)
        if( l['l'] < cutoff): break
        # highlight each distinct fragment text only once
        if l['txt'] in ls:
            continue
        ls.append(l['txt'])
        t = l['txt']
        # for valid matches: when the fragment starts/ends with an alphanumeric
        # character, require a non-word character next to it so the match
        # cannot begin or end in the middle of a word
        btxt = ''
        etxt = ''
        if t[0][0].isalnum():
            btxt = '\W'
        if t[-1][-1].isalnum():
            etxt = '\W'
        # tokens may be separated by whitespace and any number of markup tags
        rtxt = btxt+'\s*(?:<[^>]*>\s*)*'.join([re.escape(x) for x in t])+etxt
        regex=re.compile(rtxt, re.I | re.M | re.U)
        i=0
        # number of characters inserted into cont so far for this fragment;
        # finditer scans the string cont referred to when the loop started, so
        # its positions have to be shifted by this amount
        offset = 0
        #print "[!] Finding: %s\n\tPos: %s\n\t%s\n" % (' '.join(t), l['pos'], rtxt)
        if not l['pippi'] in tooltips:
            tooltips[l['pippi']]=annotatePippi(d,l,cutoff)
        for r in regex.finditer(cont):
            #print '[!] Match: %s\n\tStartpos: %d\n\tEndpos: %d' % (r.group(), r.start(), r.end())
            span = (('<span class="highlight %s">') % l['pippi'], '</span>')
            start = r.start()+offset
            # leave the boundary character matched by \W outside the span
            if btxt:
                start += 1
            end = r.end()+offset
            if etxt:
                end -= 1
            # close the span before every run of tags inside the match and
            # reopen it afterwards; n is the number of such runs
            match, n = re.compile(r'((?:\s*<[^>]+>)+)', re.M | re.U).subn(r'%s\1%s' % (span[1], span[0]), cont[start:end])
            cont = cont[:start]+span[0]+match+span[1]+cont[end:]
            # one outer span pair plus one extra pair per tag run was inserted
            offset += (n+1)*(len(span[0])+len(span[1]))
            matches += 1
            #print '_'*60
        #print '-'*120
    cont=anchorArticles(cont)
    #print "[!] Rendering\n\tContent length: %d" % len(cont)
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d,cutoff)),
                                               'len': len(ls),
                                               'tooltips': '\n'.join(tooltips.values()),
                                               'matches': matches}, context_instance=RequestContext(request))
Beispiel #20
0
            float(1 + self.num_docs) / (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, doc):
        """Compute the tf-idf weight of every term counted in ``doc``.

        Term frequency is the term's count in ``doc.termcnt`` divided by
        the number of stems in the document.  It is multiplied by the idf
        this object reports for the term.  Returns a dict mapping each
        term to its weight; the dict carries no ordering.
        """
        total = len(doc.stems)
        counts = doc.termcnt
        # The definition of TF uses the number of terms in the document as
        # denominator; the original author noted that for short documents
        # the size of the token set sometimes gives more intuitive results.
        return dict(
            (term, float(counts[term]) / total * self.get_idf(term))
            for term in counts)

    def save(self):
        """Store this object's attribute dict through ``MiscDb.save`` and
        keep the value it returns as ``_id``."""
        state = self.__dict__
        state['_id'] = MiscDb.save(state)


# Module-level TfIdf instance, created when the module is imported.
tfidf = TfIdf()

if __name__ == "__main__":
    # Manual check: build the same document twice and print its stems each
    # time, with a marker line in between.
    d = Doc('acta-release')
    print d.stems
    #d.save()
    print 'asdf'
    d1 = Doc('acta-release')
    print d1.stems
Beispiel #21
0
def frags(request):
    """Render ``frags.html``: one page of fragments, longest first.

    Optional query parameters:
      doc    -- object id; only fragments of that document are listed.
      pippi  -- object id; only fragments of that pippi are listed, and
                each fragment text is passed through ``diffFrag`` together
                with the previous fragment's result.
      cutoff -- minimum fragment length (default 7); only read when no
                pippi filter is given.
    Absent or malformed parameters are ignored.
    """
    filtr = {}
    docfilter = None
    cutoff = None
    pippifilter = None
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # No usable document id: deliberately fall back to "no filter".
        pass
    if docfilter:
        filtr['doc'] = docfilter
    try:
        pippifilter = ObjectId(cgi.escape(request.GET.get('pippi', '')))
    except Exception:
        # No usable pippi id: deliberately fall back to "no filter".
        pass
    if pippifilter:
        filtr['pippi'] = pippifilter
    else:
        try:
            cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
        except (TypeError, ValueError):
            # Non-numeric cutoff: list fragments of any length.
            pass
    if cutoff:
        filtr['l'] = {'$gte': cutoff}
    orderBy = 'l'
    orderDesc = True
    template_vars = pager(request, Frags.find(filtr), orderBy, orderDesc)
    prevDoc = None
    template_vars['frags'] = []
    for frag in template_vars['data']:
        p = Pippi('', oid=frag['pippi'])
        d = Doc(oid=frag['doc'])
        if pippifilter:
            frag['txt'] = diffFrag(prevDoc, frag['txt'])
            prevDoc = frag['txt']
        template_vars['frags'].append({
            '_id': frag['_id'],
            'pos': frag['pos'],
            'txt': " ".join(frag['txt']),
            'len': frag['l'],
            # sum of the document's tf-idf weights over the pippi's terms
            'score': sum([d.tfidf.get(t, 0) for t in p.pippi]),
            'pippi': p,
            'doc': d,
        })

    template_vars['pippi'] = pippifilter
    template_vars['doc'] = docfilter
    if docfilter:
        # find_one returns None for an id that matches no document; the
        # title is then simply left out instead of raising TypeError.
        found = Docs.find_one({'_id': docfilter}, ['docid'])
        if found is not None:
            template_vars['docTitle'] = found['docid']
    if pippifilter:
        template_vars[
            'pippiFilter'] = 1  #" ".join(Pippies.find_one({'_id': pippifilter},['pippi'])['pippi'])
    return render_to_response('frags.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #22
0
 def _getScore(self):
     """Return the sum of this fragment's document tf-idf weights over the
     terms of its pippi; terms without a weight count as 0."""
     document = Doc(oid=self.doc)
     pattern = Pippi('', oid=self.pippi)
     weights = [document.tfidf.get(term, 0) for term in pattern.pippi]
     score = 0
     for weight in weights:
         score += weight
     return score
Beispiel #23
0
 def getDocs(self, d, cutoff=7):
     """Return the set of ``Doc`` objects for every id in ``self.docs``
     except the id of ``d``.  ``cutoff`` is accepted but not used here."""
     others = []
     for oid in self.docs:
         if oid != d._id:
             others.append(Doc(oid=oid))
     return set(others)
Beispiel #24
0
        if not stem in frags:
            frags[stem] = [l, [], []]
        # append position to list appointed by sel
        sel = ((end - l) / ld) > 0
        frags[stem][1 + sel].append(end - l - (sel * ld))

    if saver:
        saver.addDocs(D1, D2)
        # also save as annotations
        saveNotes(D1, D2, frags, settings.ROOT_URL)
    return [(saver.save(
        D1, D2, bulksaver.lcsPkt(sorted(a), sorted(b), l, stem, D1, D2))
             if saver else (l, stem, sorted(a), sorted(b)))
            for stem, (l, a, b) in frags.items() if a and b]


if __name__ == "__main__":
    # Command-line entry: load the two documents whose ids are given as the
    # first and second argument and run pippi() on them with no saver.
    from lenx.view.doc import Doc
    import sys

    d1, d2 = [Doc(docid=sys.argv[position].strip('\t\n'))
              for position in (1, 2)]
    pips = pippi(d1, d2, None)