Beispiel #1
0
def filterDocs(request):
    """Return one page of documents, filtered by GET parameters, as JSON.

    GET parameters read here:
      q       -- matched case-insensitively against the 'title' field as a
                 regular expression; if it does not compile it is matched
                 as literal text instead
      starred -- when 'true', restrict to the ids held in the session's
                 'starred' entry
      mine    -- when 'true', restrict to documents whose 'owner' equals
                 unicode(request.user)

    The dict returned by pager() gains a 'docs' list with one summary dict
    per entry of its 'data', and is sent through jdump() as the response body.
    """
    starred = request.session.get('starred', set())
    query = {}
    q = request.GET.get('q')
    if q:
        try:
            title_re = re.compile(q, re.I)
        except re.error:
            # q is raw user input and need not be a valid pattern; match it
            # literally rather than letting the request fail.
            title_re = re.compile(re.escape(q), re.I)
        query['title'] = title_re
    if request.GET.get('starred') == 'true':
        query['_id'] = {'$in': [ObjectId(x) for x in starred]}
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request,
                Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid', False)
    docs = []
    for doc in (Doc(d=d) for d in res['data']):
        oid = str(doc._id)
        # The session stores starred ids as strings, so compare the string form.
        is_starred = oid in starred
        docs.append({'id': doc.docid,
                     'starred': u'\u2605' if is_starred else u'\u2606',
                     'starclass': 'starred' if is_starred else '',
                     'title': doc.title,
                     'meta': doc.metadata,
                     'oid': oid,
                     'indexed': doc.pippiDocs,
                     'pippies': len(doc.pippies),
                     'type': doc.type,
                     'tags': doc.autoTags(25),
                     })
    res['docs'] = docs
    return HttpResponse(jdump(res), mimetype="application/json")
Beispiel #2
0
def filterDocs(request):
    """Return one page of documents, filtered by GET parameters, as JSON.

    GET parameters read here:
      q       -- case-insensitive regular expression applied to 'title';
                 an uncompilable pattern is searched for as literal text
      starred -- 'true' limits the result to ids from the session's
                 'starred' entry
      mine    -- 'true' limits the result to documents whose 'owner' equals
                 unicode(request.user)

    The pager() result is extended with a 'docs' list (one summary dict per
    item of its 'data') and serialised with jdump().
    """
    query = {}
    q = request.GET.get('q')
    if q:
        try:
            query['title'] = re.compile(q, re.I)
        except re.error:
            # User input is not guaranteed to be a valid regular expression;
            # degrade to a literal match instead of raising out of the view.
            query['title'] = re.compile(re.escape(q), re.I)
    if request.GET.get('starred') == 'true':
        query['_id'] = {
            '$in': [ObjectId(x) for x in request.session.get('starred', ())]
        }
    if request.GET.get('mine') == 'true':
        query['owner'] = unicode(request.user)
    res = pager(request, Docs.find(query, sort=[('_id', pymongo.DESCENDING)]),
                'docid', False)
    starred = request.session.get('starred', set())

    def summary(doc):
        # Build the per-document dict; the starred test is done once and
        # reused for both the star glyph and the CSS class.
        oid = str(doc._id)
        is_starred = oid in starred
        return {
            'id': doc.docid,
            'starred': u'\u2605' if is_starred else u'\u2606',
            'starclass': 'starred' if is_starred else '',
            'title': doc.title,
            'meta': doc.metadata,
            'oid': oid,
            'indexed': doc.pippiDocs,
            'pippies': len(doc.pippies),
            'type': doc.type,
            'tags': doc.autoTags(25),
        }

    res['docs'] = [summary(Doc(d=d)) for d in res['data']]
    return HttpResponse(jdump(res), mimetype="application/json")
Beispiel #3
0
def pippi(request, refdoc=None):
    """Render 'pippi.html': one page of documents listed against refdoc.

    refdoc is the docid of the reference document; without it the error
    template is rendered. The reference document itself is left out of the
    listing. Each listed document carries a 'job' flag that is True when its
    _id is not contained in the reference document's pippiDocs.
    """
    if not refdoc:
        return render_to_response('error.html',
                                  {'error': 'specify document: %s!' % refdoc},
                                  context_instance=RequestContext(request))
    refdoc = Doc(docid=refdoc)
    template_vars = pager(request, Docs.find({}, ['_id', 'docid']), 'docid',
                          False)
    # Materialise the page as (docid, _id) pairs, ordered by docid.
    docs = sorted([(doc['docid'], doc['_id'])
                   for doc in template_vars['data']])
    rows = []
    for docid, oid in docs:
        if oid == refdoc._id:
            # Do not list the reference document against itself.
            continue
        doc = Doc(docid=docid)
        rows.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'job': doc._id not in refdoc.pippiDocs,
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = rows
    template_vars['stats'] = getOverview()
    template_vars['refdoc'] = refdoc.docid
    template_vars['reftitle'] = refdoc.title
    template_vars['oid'] = str(refdoc._id)
    template_vars['starred'] = request.session.get('starred', set())
    return render_to_response('pippi.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #4
0
def docView(request, doc=None, cutoff=10):
    """Render 'docView.html' for the document with docid `doc`.

    `cutoff` may be overridden by the 'cutoff' GET parameter. A missing
    document id, a zero cutoff or a cutoff that is not an integer renders the
    error template.

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be constructed.
    """
    raw_cutoff = request.GET.get('cutoff')
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # Non-numeric input: fall through to the "wrong cutoff" error page
            # below instead of failing with an unhandled exception.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response(
            'error.html', {'error': 'Missing document or wrong cutoff!'},
            context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # are not turned into a 404.
        raise Http404
    cont = d.body
    relDocs = Docs.find(
        {'_id': {
            '$in': list(d.getRelatedDocIds(cutoff=cutoff))
        }}, ['docid', 'title'])
    return render_to_response('docView.html', {
        'doc': d,
        'oid': d._id,
        'user': request.user,
        'content': cont,
        'related': relDocs,
        'cutoff': cutoff,
        'cutoffs': ','.join(cutoffSL(d, cutoff)),
        'len': d.getFrags(cutoff=cutoff).count()
    },
                              context_instance=RequestContext(request))
Beispiel #5
0
def starred(request):
    """List the documents whose ids are stored in the session's 'starred' entry.

    The matching documents are queried in ascending docid order, paged with
    pager() and handed to _listDocs() under the title
    'Your starred documents'.
    """
    starred_oids = [ObjectId(x) for x in request.session.get('starred', ())]
    cursor = Docs.find({'_id': {'$in': starred_oids}},
                       sort=[('docid', pymongo.ASCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
Beispiel #6
0
def starred(request):
    """Page through the session's starred documents and render the listing.

    Ids come from request.session['starred'] (empty when absent); results
    are sorted by docid ascending before being passed to pager() and then
    to _listDocs().
    """
    session_ids = request.session.get('starred', ())
    id_filter = {'$in': [ObjectId(x) for x in session_ids]}
    ordering = [('docid', pymongo.ASCENDING)]
    template_vars = pager(request,
                          Docs.find({'_id': id_filter}, sort=ordering),
                          'docid',
                          False)
    template_vars['title'] = 'Your starred documents'
    return _listDocs(request, template_vars)
Beispiel #7
0
def metaView(request,doc=None):
    """Render 'meta.html' with the metadata of the document with docid `doc`.

    Without `doc` the error template is rendered. If Doc(docid=doc) cannot be
    constructed, the upload form is rendered instead, pre-filled with the
    requested docid.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate instead of showing the upload form.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))

    # Related documents at cutoff 5, fetching only docid and title.
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=5))} }, ['docid','title'])
    return render_to_response('meta.html', {'doc': d,
                                            'related': relDocs,
                                            'metadata': d.metadata,
                                            }, context_instance=RequestContext(request))
Beispiel #8
0
def metaView(request,doc=None):
    """Render 'meta.html' with the metadata of the document with docid `doc`.

    Without `doc` the error template is rendered.

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be constructed.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # are not converted into a 404.
        raise Http404

    # Related documents at cutoff 5, fetching only docid and title.
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=5))} }, ['docid','title'])
    return render_to_response('meta.html', {'doc': d,
                                            'oid': d._id,
                                            'related': relDocs,
                                            'metadata': d.metadata,
                                            }, context_instance=RequestContext(request))
Beispiel #9
0
def starred(request):
    """Render 'corpus.html' for the documents starred in the current session.

    Ids are read from request.session['starred'] (empty when absent). Every
    document on the current page is loaded through Doc(docid=...) and
    summarised into template_vars['docs'].
    """
    starred_ids = [ObjectId(x) for x in request.session.get('starred', ())]
    template_vars = pager(request,
                          Docs.find({'_id': {'$in': starred_ids}}, ['_id', 'docid']),
                          'docid', False)
    # Collect the docids first so the page is fully read before the
    # per-document lookups start.
    docids = [row['docid'] for row in template_vars['data']]
    template_vars['docs'] = [{'id': doc.docid,
                              'oid': str(doc._id),
                              'indexed': doc.pippiDocsLen,
                              'title': doc.title,
                              'frags': doc.getFrags().count(),
                              'pippies': len(doc.pippies),
                              'type': doc.type,
                              'docs': len(doc.getRelatedDocIds()),
                              'tags': doc.autoTags(25)}
                             for doc in (Doc(docid=d) for d in docids)]
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())
    template_vars['title'] = 'Your starred documents'
    return render_to_response('corpus.html', template_vars, context_instance=RequestContext(request))
Beispiel #10
0
def search(request):
    """Render 'search.html' for the documents matching the 'q' GET parameter.

    The escaped query is tokenised and stemmed; stems that are stop words or
    at most one character long are replaced by ''. DocStems entries whose
    'value' contains all resulting terms select the documents, which are
    looked up through their 'stemsid'.
    """
    q = cgi.escape(request.GET.get('q', ''))
    if not q:
        return render_to_response('error.html',
                                  {'error': 'Missing search query!'},
                                  context_instance=RequestContext(request))

    lang = guessLanguage(q)
    # Fall back to the English stop-word list for unknown languages.
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)

    def usable_stem(token):
        # First stem of the token, or '' when it is missing, a stop word,
        # or too short.
        stems = engine.stem(token.encode('utf8'))
        if not stems:
            return ''
        head = stems[0]
        if head in swords or len(head) <= 1:
            return ''
        return head

    filtr = [usable_stem(word)
             for word in nltk.tokenize.wordpunct_tokenize(unicode(q))]

    stem_hits = DocStems.find({'value': {'$all': filtr}}, ['_id'])
    matches = [hit['_id'] for hit in stem_hits]

    template_vars = pager(request,
                          Docs.find({"stemsid": {'$in': matches}}),
                          'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    rows = []
    for raw in template_vars['data']:
        doc = Doc(d=raw)
        rows.append({
            'id': doc.docid,
            'oid': str(doc._id),
            'indexed': doc.pippiDocsLen,
            'title': doc.title,
            'frags': doc.getFrags().count(),
            'pippies': len(doc.pippies),
            'type': doc.type,
            'docs': len(doc.getRelatedDocIds()),
            'tags': doc.autoTags(25)
        })
    template_vars['docs'] = rows
    return render_to_response('search.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #11
0
def metaView(request, doc=None):
    """Render 'meta.html' with the metadata of the document with docid `doc`.

    Without `doc` the error template is rendered.

    Raises:
        Http404: if Doc(docid=doc, owner=request.user) cannot be constructed.
    """
    if not doc:
        return render_to_response('error.html', {'error': 'Missing document!'},
                                  context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc, owner=request.user)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # are not converted into a 404.
        raise Http404

    # Related documents at cutoff 5, fetching only docid and title.
    relDocs = Docs.find({'_id': {
        '$in': list(d.getRelatedDocIds(cutoff=5))
    }}, ['docid', 'title'])
    return render_to_response('meta.html', {
        'doc': d,
        'oid': d._id,
        'related': relDocs,
        'metadata': d.metadata,
    },
                              context_instance=RequestContext(request))
Beispiel #12
0
def docView(request,doc=None,cutoff=10):
    """Render 'docView.html' for the document with docid `doc`.

    `cutoff` may be overridden by the 'cutoff' GET parameter. A missing
    document id, a zero cutoff or a cutoff that is not an integer renders the
    error template. If Doc(docid=doc) cannot be constructed, the upload form
    is rendered instead, pre-filled with the requested docid.
    """
    raw_cutoff = request.GET.get('cutoff')
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # Non-numeric input: fall through to the "wrong cutoff" error page
            # below instead of failing with an unhandled exception.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate instead of showing the upload form.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))
    cont = d.body
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=cutoff))} }, ['docid','title'])
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d,cutoff)),
                                               'len': d.getFrags(cutoff=cutoff).count()}, context_instance=RequestContext(request))
Beispiel #13
0
def _progress(done, total):
    """Write a progress mark to stdout when the integer percentage advances.

    Writes the percentage itself when it is a multiple of ten and a '.'
    otherwise; nothing is written while the percentage is unchanged.
    """
    # Floor division keeps the percentage an integer regardless of whether
    # true division is in effect.
    percent = done * 100 // total
    if percent != (done - 1) * 100 // total:
        sys.stdout.write(("%d" % percent) if percent % 10 == 0 else '.')
        sys.stdout.flush()


def main():
    """Recompute 'relevance'/'docslen' on all pippies and 'tfidf' on all docs.

    For every pippi with at least one document, relevance is set to
    len / number-of-docs and docslen to the number of docs. For every
    document, tfidf is set from Doc(d=...).tfidf. Progress is written to
    stdout.
    """
    sys.stdout.write("updateing pippies.relevance\n")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _progress(i, pippieslen)
        doc_count = len(pippi['docs'])
        if doc_count > 0:
            Pippies.update({'_id': pippi['_id']},
                           {'$set': {'relevance': float(pippi['len']) / float(doc_count),
                                     'docslen': doc_count}})
    sys.stdout.write('\n')
    sys.stdout.flush()

    sys.stdout.write("updateing docs.idf\n")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _progress(i, docslen)
        Docs.update({'_id': dd['_id']}, {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
Beispiel #14
0
def search(request):
    """Render 'search.html' for the documents matching the 'q' GET parameter.

    Each token of the escaped query is stemmed; a stem that is a stop word
    or no longer than one character contributes '' instead. DocStems entries
    whose 'value' holds all resulting terms determine which documents
    (by 'stemsid') are paged and shown.
    """
    q = cgi.escape(request.GET.get('q',''))
    if not q:
        return render_to_response('error.html', {'error': 'Missing search query!'}, context_instance=RequestContext(request))

    lang = guessLanguage(q)
    # Unknown languages use the English stop-word list.
    swords = stopmap.stopmap.get(lang, stopmap.stopmap['en'])
    engine = getStemmer(lang)
    filtr = []
    for word in nltk.tokenize.wordpunct_tokenize(unicode(q)):
        stem = engine.stem(word.encode('utf8'))
        keep = stem and stem[0] not in swords and len(stem[0]) > 1
        filtr.append(stem[0] if keep else '')

    matches = []
    for hit in DocStems.find({'value': {'$all': filtr}}, ['_id']):
        matches.append(hit['_id'])

    found = Docs.find({"stemsid": {'$in': matches}})
    template_vars = pager(request, found, 'docid', False)
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    template_vars['stats'] = getOverview()
    template_vars['starred'] = request.session.get('starred', set())

    def summarise(doc):
        # One row of the result table.
        return {'id': doc.docid,
                'oid': str(doc._id),
                'indexed': doc.pippiDocsLen,
                'title': doc.title,
                'frags': doc.getFrags().count(),
                'pippies': len(doc.pippies),
                'type': doc.type,
                'docs': len(doc.getRelatedDocIds()),
                'tags': doc.autoTags(25)}

    template_vars['docs'] = [summarise(Doc(d=d)) for d in template_vars['data']]
    return render_to_response('search.html', template_vars, context_instance=RequestContext(request))
Beispiel #15
0
def pippi(request,refdoc=None):
    """Render 'pippi.html': one page of documents listed against refdoc.

    refdoc is the docid of the reference document; without it the error
    template is rendered. The reference document itself is left out of the
    listing. Each listed document carries a 'job' flag that is True when its
    _id is not contained in the reference document's pippiDocs.
    """
    if not refdoc:
        return render_to_response('error.html', {'error': 'specify document: %s!' % refdoc}, context_instance=RequestContext(request))
    refdoc=Doc(docid=refdoc)
    template_vars=pager(request,Docs.find({},['_id','docid']),'docid',False)
    # Materialise the page as (docid, _id) pairs, ordered by docid.
    docs=sorted([(doc['docid'],doc['_id']) for doc in template_vars['data']])
    template_vars['docs']=[{'id': doc.docid,
                            'oid': str(doc._id),
                            'indexed': doc.pippiDocsLen,
                            'title': doc.title,
                            'frags': doc.getFrags().count(),
                            'pippies': len(doc.pippies),
                            'job': doc._id not in refdoc.pippiDocs,
                            'type': doc.type,
                            'docs': len(doc.getRelatedDocIds()),
                            'tags': doc.autoTags(25) }
                           # skip the reference document itself
                           for doc in (Doc(docid=d) for d,oid in docs if not oid == refdoc._id)]
    template_vars['stats']=getOverview()
    template_vars['refdoc']=refdoc.docid
    template_vars['reftitle']=refdoc.title
    template_vars['oid']=str(refdoc._id)
    template_vars['starred']=request.session.get('starred',set())
    return render_to_response('pippi.html', template_vars, context_instance=RequestContext(request))
Beispiel #16
0
def docView(request,doc=None,cutoff=10):
    """Render 'docView.html' with the document body and highlighted fragments.

    `cutoff` may be overridden by the 'cutoff' GET parameter. A missing
    document id, a zero cutoff or a cutoff that is not an integer renders the
    error template. If Doc(docid=doc) cannot be constructed, the upload form
    is rendered instead, pre-filled with the requested docid.

    Every distinct fragment text returned by d.getFrags(cutoff=cutoff) is
    searched for in the body, and each occurrence is wrapped in
    <span class="highlight PIPPI">...</span>, where PIPPI is the fragment's
    'pippi' value.
    """
    raw_cutoff = request.GET.get('cutoff')
    if raw_cutoff:
        try:
            cutoff = int(raw_cutoff)
        except ValueError:
            # Non-numeric input: fall through to the "wrong cutoff" error page
            # below instead of failing with an unhandled exception.
            cutoff = 0
    if not doc or not cutoff:
        return render_to_response('error.html', {'error': 'Missing document or wrong cutoff!'}, context_instance=RequestContext(request))
    try:
        d = Doc(docid=doc)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate instead of showing the upload form.
        form = UploadForm({'docid': doc})
        return render_to_response('upload.html', { 'form': form, }, context_instance=RequestContext(request))
    tooltips={}
    cont = d.body
    relDocs = Docs.find({'_id': { '$in': list(d.getRelatedDocIds(cutoff=cutoff))} }, ['docid','title'])
    ls = []
    matches = 0
    # Runs of markup tags inside a match; compiled once for all fragments.
    tag_run = re.compile(r'((?:\s*<[^>]+>)+)', re.M | re.U)
    for l in d.getFrags(cutoff=cutoff):
        # NOTE(review): stopping at the first short fragment presumably relies
        # on getFrags() yielding fragments longest-first -- confirm.
        if( l['l'] < cutoff): break
        # Handle every distinct fragment text only once.
        if l['txt'] in ls:
            continue
        ls.append(l['txt'])
        t = l['txt']
        # Require a non-word character next to an alphanumeric first/last
        # character so the fragment only matches on whole words.
        btxt = ''
        etxt = ''
        if t[0][0].isalnum():
            btxt = r'\W'
        if t[-1][-1].isalnum():
            etxt = r'\W'
        # Consecutive tokens may be separated by whitespace and markup tags.
        rtxt = btxt+r'\s*(?:<[^>]*>\s*)*'.join([re.escape(x) for x in t])+etxt
        regex=re.compile(rtxt, re.I | re.M | re.U)
        # finditer() below scans the body as it is at this point; `offset`
        # shifts its positions by the markup inserted for earlier matches of
        # this fragment.
        offset = 0
        if not l['pippi'] in tooltips:
            tooltips[l['pippi']]=annotatePippi(d,l,cutoff)
        span = (('<span class="highlight %s">') % l['pippi'], '</span>')
        for r in regex.finditer(cont):
            start = r.start()+offset
            if btxt:
                # exclude the leading \W character from the highlight
                start += 1
            end = r.end()+offset
            if etxt:
                # exclude the trailing \W character from the highlight
                end -= 1
            # Close the span before each run of tags inside the match and
            # reopen it afterwards; n counts those runs.
            match, n = tag_run.subn(r'%s\1%s' % (span[1], span[0]), cont[start:end])
            cont = cont[:start]+span[0]+match+span[1]+cont[end:]
            # One opening/closing pair around the match plus one per tag run.
            offset += (n+1)*(len(span[0])+len(span[1]))
            matches += 1
    cont=anchorArticles(cont)
    return render_to_response('docView.html', {'doc': d,
                                               'oid': d._id,
                                               'user': request.user,
                                               'content': cont,
                                               'related': relDocs,
                                               'cutoff': cutoff,
                                               'cutoffs': ','.join(cutoffSL(d,cutoff)),
                                               'len': len(ls),
                                               'tooltips': '\n'.join(tooltips.values()),
                                               'matches': matches}, context_instance=RequestContext(request))
Beispiel #17
0
def listDocs(request):
    """List all documents, queried in descending docid order.

    The cursor is paged with pager() and rendered by _listDocs() under the
    title 'Complete Corpus of pippi longstrings'.
    """
    cursor = Docs.find(sort=[('docid', pymongo.DESCENDING)])
    template_vars = pager(request, cursor, 'docid', False)
    template_vars['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, template_vars)
Beispiel #18
0
def listDocs(request):
    """Render the full corpus listing, sorted by docid in descending order.

    Delegates paging to pager() and rendering to _listDocs().
    """
    by_docid_desc = [('docid', pymongo.DESCENDING)]
    page = pager(request, Docs.find(sort=by_docid_desc), 'docid', False)
    page['title'] = 'Complete Corpus of pippi longstrings'
    return _listDocs(request, page)