Beispiel #1
0
def cutoffSL(doc, cutoff):
    """Return per-length pippi counts for one document as sparkline data.

    A map/reduce over ``Pippies`` (restricted to entries whose ``docs``
    contain ``doc._id``) counts the pippies for every ``len`` value.  The
    counts are turned into a list of strings indexed by length, with ``'0'``
    for lengths that have no pippies, and the slice ``[4:cutoff]`` of that
    list is returned.  An empty list is returned when nothing was counted.
    """
    map_fn = Code("function(){ emit( this.len , { count : 1 } );}")
    reduce_fn = Code("function (key, values) { var count = 0; values.forEach(function (v) {count += v.count;}); return {count: count}; }")
    counts = {}
    # Only run the map/reduce when the collection has any entries at all.
    if Pippies.count() > 0:
        reduced = Pippies.map_reduce(map_fn, reduce_fn, 'cutoff sparkline',
                                     query={'docs': doc._id})
        for row in reduced.find():
            counts[row['_id']] = int(row['value']['count'])
    if not counts:
        return []
    upper = int(max(counts.keys()) + 1)
    series = []
    for length in xrange(upper):
        if length in counts:
            series.append(str(counts[length]))
        else:
            series.append('0')
    return series[4:cutoff]
Beispiel #2
0
def getOverview():
    """Return the overview statistics as a list of dicts.

    One entry each is produced for ``Docs``, ``Pippies`` and ``Frags``, in
    that order.  Every entry has the keys ``title``, ``value`` (the
    collection count) and ``text`` (the count embedded in a short label).

    Each collection is counted exactly once so that ``value`` and ``text``
    always show the same number and no redundant database query is issued.
    """
    stats = []
    for title, label, collection in (
            ('Total documents', "%s Documents", Docs),
            ('Total Pippies', "with %s Pippies", Pippies),
            ('Locations', "in %s Locations", Frags)):
        total = collection.count()
        stats.append({'title': title,
                      'value': total,
                      'text': label % total})
    return stats
Beispiel #3
0
 def save(self,d1,d2,pkt):
     """Store one pippi shared by documents ``d1`` and ``d2``.

     ``pkt`` is a mapping read via the keys ``'pippi'``, ``'l'``, ``'d1ps'``
     and ``'d2ps'``; each entry of the two position lists is read via
     ``'pos'`` and ``'txt'``.  The pippi id is added to both documents, both
     document ids are added to the pippi, and one ``Frags`` record is saved
     per position.  Returns ``pkt``, or ``None`` when ``pkt`` is falsy.
     """
     # todo new code to directly addtoset mongo-style
     if not pkt:
         return
     pippi = Pippi(pkt['pippi'])
     # Register the pippi on both documents.
     for owner in (d1, d2):
         Docs.update({'_id': owner._id},
                     {'$addToSet': {'pippies': pippi._id}})
     # NOTE(review): 'docslen' is incremented by 2 unconditionally, even when
     # $addToSet finds the ids already present -- confirm this is intended.
     both_ids = [d1._id, d2._id]
     Pippies.update({'_id': pippi._id},
                    {'$addToSet': {'docs': {'$each': both_ids}},
                     '$inc': {'docslen': 2}})
     # One Frags record per occurrence, d1's positions first, then d2's.
     occurrences = [(d1._id, p) for p in pkt['d1ps']]
     occurrences = occurrences + [(d2._id, p) for p in pkt['d2ps']]
     for doc_id, p in occurrences:
         Frags.save({'pos': p['pos'], 'txt': p['txt'], 'l': pkt['l'], 'doc': doc_id, 'pippi': pippi._id})
     return pkt
Beispiel #4
0
 def __init__(self, pippi, oid=None, frag=None):
     """Load a pippi from ``Pippies`` or create and save a new one.

     Lookup is by ``oid`` when given, otherwise by the space-joined
     ``pippi`` words; a ``frag`` passed by the caller is used only when
     neither lookup is attempted.  A found record becomes the instance
     ``__dict__`` with ``pippi`` converted to a tuple of words.  When
     nothing is found a fresh record is built and ``self.save()`` is called.
     """
     if oid:
         # get by mongo oid
         frag = Pippies.find_one({"_id": oid})
     elif pippi:
         # get by pippi
         frag = Pippies.find_one({"pippi": ' '.join(pippi)})
     if not frag:
         # should a be a set of {'pos':p,'txt':txt,'l':l,'doc':_id}
         self.__dict__ = {'pippi': tuple(pippi),
                          'len': len(pippi),
                          'docs': []}
         self.save()
         return
     self.__dict__ = frag
     # The stored form is a single space-separated string.
     self.pippi = tuple(self.pippi.split(" "))
Beispiel #5
0
 def getRelatedDocIds(self, cutoff=7):
     """Return the set of other document ids sharing a pippi with this one.

     Only pippies whose ``len`` is at least ``int(cutoff)`` and whose
     ``docs`` contain ``self._id`` are considered; ``self._id`` itself is
     excluded from the result.
     """
     query = {'len': {'$gte': int(cutoff)},
              'docs': self._id}
     related = set()
     for pippi in Pippies.find(query, ['docs']):
         for doc in pippi['docs']:
             if doc != self._id:
                 related.add(doc)
     return related
Beispiel #6
0
 def getRelatedDocIds(self, cutoff=7):
     """Collect ids of documents that share a long-enough pippi with self.

     Queries ``Pippies`` for entries with ``len >= int(cutoff)`` that list
     ``self._id`` in ``docs`` and returns every other id found in those
     ``docs`` lists as a set.
     """
     min_len = int(cutoff)
     shared = Pippies.find({'len': {'$gte': min_len}, 'docs': self._id},
                           ['docs'])
     return set(doc
                for pippi in shared
                for doc in pippi['docs']
                if doc != self._id)
Beispiel #7
0
def search(request):
    """Render the search results page for the query in GET parameter ``q``.

    The query is tokenized, every token is replaced by its first hunspell
    stem (or ``''`` when there is none), and ``Pippies`` is searched with a
    regex built from the space-joined stems.  Results are paged via
    ``pager`` and ordered by ``orderby`` (``relevance``, ``docslen`` or
    ``len``; anything else falls back to ``len``), always descending.

    Renders ``error.html`` when ``q`` is missing or empty, otherwise
    ``search.html``.
    """
    q = cgi.escape(request.GET.get('q',''))
    if not q:
        return render_to_response('error.html', {'error': 'Missing search query!'}, context_instance=RequestContext(request))

    orderBy = cgi.escape(request.GET.get('orderby',''))
    # TODO also order by docslen (need to add that to bulksaver)
    if orderBy not in ('relevance', 'docslen', 'len'):
        orderBy = 'len'
    # TODO also handle desc/asc via the tableheader on the web ui
    orderDesc = True
    engine = hunspell.HunSpell(settings.DICT+'.dic', settings.DICT+'.aff')
    filtr = []
    for word in nltk.tokenize.wordpunct_tokenize(unicode(q)):
        # stem each word
        stem = engine.stem(word.encode('utf8'))
        filtr.append(stem[0] if stem else '')
    # NOTE(review): the stems are compiled into the regex unescaped, and the
    # pippi text is interpolated into HTML unescaped -- confirm both are safe.
    template_vars = pager(request, Pippies.find({'pippi': re.compile(' '.join(filtr))}), orderBy, orderDesc)
    # The highlighted needle does not depend on the row, so build it once.
    needle = ' '.join([p if p else '*' for p in filtr])
    rows = []
    for pippi in template_vars['data']:
        words = pippi['pippi'].split(' ')
        shown = ' '.join([p if p else '*' for p in words])
        rows.append({'id': pippi['_id'],
                     'pippi': '%s<span class="hilite-query">%s</span>%s' % shown.partition(needle),
                     # Records without a stored 'docslen' used to raise
                     # KeyError here; fall back to the size of 'docs'.
                     'docslen': pippi.get('docslen', len(pippi.get('docs', []))),
                     'len': len(words),
                     'relevance': pippi.get('relevance', 0)})
    template_vars['pippies'] = rows
    template_vars['getparams'] = request.GET.urlencode()
    template_vars['q'] = q
    return render_to_response('search.html', template_vars, context_instance=RequestContext(request))
Beispiel #8
0
 def __init__(self, pippi, oid=None, frag=None):
     """Initialise from a stored ``Pippies`` record or create a new one.

     ``oid`` takes precedence for the lookup; otherwise a non-empty
     ``pippi`` (a sequence of words) is looked up by its space-joined form.
     If neither lookup runs, the ``frag`` argument is used as given.  A
     truthy record replaces the instance ``__dict__`` and its ``pippi``
     string is split into a tuple; otherwise a new record with ``pippi``,
     ``len`` and an empty ``docs`` list is created and saved.
     """
     record = frag
     if oid:
         # get by mongo oid
         record = Pippies.find_one({"_id": oid})
     elif pippi:
         # get by pippi
         joined = ' '.join(pippi)
         record = Pippies.find_one({"pippi": joined})
     if record:
         self.__dict__ = record
         words = self.pippi.split(" ")
         self.pippi = tuple(words)
     else:
         # should a be a set of {'pos':p,'txt':txt,'l':l,'doc':_id}
         fresh = {'pippi': tuple(pippi), 'len': len(pippi), 'docs': []}
         self.__dict__ = fresh
         self.save()
Beispiel #9
0
def getOverview():
    """Build the list of headline statistics shown on the overview.

    Returns three dicts (documents, pippies, locations), each with
    ``title``, ``value`` and ``text``.  Every collection is counted once up
    front; previously each was counted twice, which doubled the queries and
    allowed ``value`` and ``text`` to show different numbers if the
    collection changed between the two calls.
    """
    doc_total = Docs.count()
    pippi_total = Pippies.count()
    frag_total = Frags.count()
    return [
        {
            'title': 'Total documents',
            'value': doc_total,
            'text': "%s Documents" % doc_total
        },
        {
            'title': 'Total Pippies',
            'value': pippi_total,
            'text': "with %s Pippies" % pippi_total
        },
        {
            'title': 'Locations',
            'value': frag_total,
            'text': "in %s Locations" % frag_total
        },
    ]
Beispiel #10
0
def cutoffSL(doc, cutoff):
    """Compute the cutoff sparkline values for ``doc``.

    Counts, via map/reduce on ``Pippies``, how many pippies of each ``len``
    list ``doc._id`` in their ``docs``.  Returns the counts as strings for
    lengths ``4`` up to (but excluding) ``cutoff``, using ``'0'`` where no
    pippi of that length exists, or ``[]`` when there are no counts.
    """
    mapper = Code("function(){ emit( this.len , { count : 1 } );}")
    reducer = Code(
        "function (key, values) { var count = 0; values.forEach(function (v) {count += v.count;}); return {count: count}; }"
    )
    if Pippies.count() > 0:
        rows = Pippies.map_reduce(
            mapper, reducer, 'cutoff sparkline',
            query={'docs': doc._id}).find()
        per_length = dict(
            (row['_id'], int(row['value']['count'])) for row in rows)
    else:
        per_length = {}
    if not per_length:
        return []
    longest = max(per_length.keys())
    # Index by length; lengths with no pippies contribute '0'.
    values = [
        str(per_length.get(length, 0))
        for length in xrange(int(longest + 1))
    ]
    return values[4:cutoff]
Beispiel #11
0
def _report_progress(step, total):
    """Write a progress mark to stdout when the integer percentage advances.

    Prints the percentage itself at every multiple of ten and a ``.`` for
    every other new percentage value; prints nothing otherwise.
    """
    percent = step * 100 // total
    if percent == (step - 1) * 100 // total:
        return
    if percent % 10 == 0:
        sys.stdout.write("%d" % percent)
    else:
        sys.stdout.write('.')
    sys.stdout.flush()


def main():
    """Recompute ``relevance``/``docslen`` on pippies and ``tfidf`` on docs.

    For every pippi with at least one entry in ``docs``, ``relevance`` is
    set to ``len / number of docs`` and ``docslen`` to the number of docs.
    For every document, ``tfidf`` is set from ``Doc(d=...).tfidf``.
    Progress is written to stdout for both passes.
    """
    print("updateing pippies.relevance")
    pippies = Pippies.find({}, ['docs', 'len'])
    pippieslen = pippies.count()
    for i, pippi in enumerate(pippies, 1):
        _report_progress(i, pippieslen)
        doc_count = len(pippi['docs'])
        if doc_count > 0:
            Pippies.update({'_id': pippi['_id']},
                           {'$set': {'relevance': float(pippi['len']) / float(doc_count),
                                     'docslen': doc_count}})
    sys.stdout.write('\n')
    sys.stdout.flush()

    print("updateing docs.idf")
    docs = Docs.find({}, ['termcnt', 'docid', 'stemsid', 'rawid'])
    docslen = docs.count()
    for i, dd in enumerate(docs, 1):
        _report_progress(i, docslen)
        Docs.update({'_id': dd['_id']}, {'$set': {'tfidf': Doc(d=dd).tfidf}})
    sys.stdout.write('\n')
    sys.stdout.flush()
Beispiel #12
0
def pippies(request):
    """Render the pippi listing, filtered and ordered by GET parameters.

    Recognised parameters:
      * ``cutoff``    -- minimum ``len`` (default ``7``)
      * ``doc``       -- ObjectId that must appear in ``docs``
      * ``relevance`` -- integer that ``relevance`` must equal
      * ``orderby``   -- sort field handed to ``pager`` (default
                         ``relevance``)
      * ``desc``      -- ``'1'`` (default) for descending order

    Filter values that cannot be parsed are ignored.  Renders
    ``pippies.html``.
    """
    filtr = {}
    docfilter = None
    relfilter = None
    cutoff = None
    try:
        cutoff = int(cgi.escape(request.GET.get('cutoff', '7')))
    except ValueError:
        pass
    if cutoff: filtr['len'] = {'$gte': cutoff}
    try:
        docfilter = ObjectId(cgi.escape(request.GET.get('doc', '')))
    except Exception:
        # The invalid-id exception class is not in scope in this block, so
        # catch Exception; unlike a bare except this no longer swallows
        # KeyboardInterrupt/SystemExit.
        pass
    if docfilter:
        filtr['docs'] = docfilter
    try:
        relfilter = int(cgi.escape(request.GET.get('relevance', '')))
    except ValueError:
        pass
    if relfilter: filtr['relevance'] = relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(request.GET.get('orderby', 'relevance'))
    orderDesc = '1' == cgi.escape(request.GET.get('desc', '1'))
    template_vars = pager(request, Pippies.find(filtr), orderBy, orderDesc)
    template_vars['pippies'] = [{
        'id':
        pippi['_id'],
        'pippi':
        ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
        'docslen':
        len(pippi['docs']),
        'relevance':
        pippi.get('relevance', 0),
    } for pippi in template_vars['data']]
    template_vars['doc'] = docfilter
    if docfilter:
        doc = Docs.find_one({'_id': docfilter}, ['docid', 'title'])
        # A well-formed id may still match no document; previously this
        # crashed with a TypeError on "'title' in None".
        if doc is not None:
            template_vars[
                'docTitle'] = doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html',
                              template_vars,
                              context_instance=RequestContext(request))
Beispiel #13
0
def pippies(request):
    """List pippies, optionally filtered by ``cutoff``, ``doc``, ``relevance``.

    ``cutoff`` (default 7) is the minimum ``len``; ``doc`` is an ObjectId
    that must be listed in ``docs``; ``relevance`` must match exactly.
    Values that fail to parse are ignored.  ``orderby`` (default
    ``relevance``) and ``desc`` (``'1'`` = descending, the default) are
    passed to ``pager``.  When a ``doc`` filter matches an existing
    document, its title (or ``docid`` if it has no title) is passed to the
    template as ``docTitle``.  Renders ``pippies.html``.
    """
    params = request.GET
    filtr={}
    docfilter=None
    relfilter=None
    cutoff=None
    try:
        cutoff = int(cgi.escape(params.get('cutoff','7')))
    except ValueError:
        pass
    if cutoff: filtr['len']={ '$gte': cutoff }
    try:
        docfilter = ObjectId(cgi.escape(params.get('doc','')))
    except Exception:
        # Narrower than the former bare except: interpreter-exit exceptions
        # now propagate.  The specific invalid-id class is not in scope here.
        pass
    if docfilter:
        filtr['docs']=docfilter
    try:
        relfilter = int(cgi.escape(params.get('relevance','')))
    except ValueError:
        pass
    if relfilter: filtr['relevance']=relfilter
    # todo add sortable column headers ala http://djangosnippets.org/snippets/308/
    orderBy = cgi.escape(params.get('orderby','relevance'))
    orderDesc = ('1'==cgi.escape(params.get('desc','1')))
    template_vars=pager(request,Pippies.find(filtr),orderBy,orderDesc)
    rows=[]
    for pippi in template_vars['data']:
        rows.append({'id': pippi['_id'],
                     'pippi': ' '.join([p if p else '*' for p in pippi['pippi'].split(' ')]),
                     'docslen':len(pippi['docs']),
                     'relevance':pippi.get('relevance',0),})
    template_vars['pippies']=rows
    template_vars['doc']=docfilter
    if docfilter:
        doc=Docs.find_one({'_id': docfilter},['docid', 'title'])
        # find_one returns None for an id that matches nothing; skip the
        # title instead of failing on "'title' in None".
        if doc is not None:
            template_vars['docTitle']=doc['title'] if 'title' in doc else doc['docid']
    return render_to_response('pippies.html', template_vars, context_instance=RequestContext(request))
Beispiel #14
0
 def save(self):
     """Persist this object to ``Pippies`` and record the returned ``_id``.

     ``pippi`` is stored as a single space-joined string, so it is swapped
     for that form while saving.  The original value is restored in a
     ``finally`` clause so that a failing save can no longer leave
     ``self.pippi`` as a string.
     """
     tmp=self.pippi
     self.pippi=" ".join(tmp)
     try:
         self.__dict__['_id']=Pippies.save(self.__dict__)
     finally:
         self.pippi=tmp
Beispiel #15
0
 def save(self):
     """Save the instance ``__dict__`` to ``Pippies`` and store its ``_id``."""
     state=self.__dict__
     state['_id']=Pippies.save(state)
Beispiel #16
0
 def save(self):
     """Write the instance to ``Pippies`` with ``pippi`` as a joined string.

     The value returned by ``Pippies.save`` is stored under ``_id``.
     ``self.pippi`` is put back to its original value even when the save
     raises; before, an exception left it as the joined string.
     """
     words = self.pippi
     self.pippi = " ".join(words)
     try:
         self.__dict__['_id'] = Pippies.save(self.__dict__)
     finally:
         # Always restore the in-memory form.
         self.pippi = words
Beispiel #17
0
 def save(self):
     """Store the instance attributes in ``Pippies`` and keep the ``_id``.

     The instance ``__dict__`` itself is passed to ``Pippies.save``; the
     value that call returns is written back under the ``_id`` key.
     """
     attrs = self.__dict__
     saved_id = Pippies.save(attrs)
     attrs['_id'] = saved_id