Example #1
0
 def reindex(self, dirty=None):
     """Rebuild the DocumentIndex entry holding this document's weighted terms.
     
     Indexable attributes are declared through an ``_indexable`` class
     attribute on any class in the MRO: either a dict mapping attribute name
     to weight, or a list of attribute names (each weighted 1.0).
     
     ``dirty`` is an optional iterable of changed attribute names.  When it is
     given and none of those names are indexable, nothing is done.
     
     Returns None.
     """
     # Walk the MRO from the most generic class to the most specific one so
     # that later (more derived) declarations override earlier weights.
     indexable = {}
     
     for base in reversed(type.mro(type(self))):
         declared = getattr(base, '_indexable', {})
         
         if isinstance(declared, dict):
             indexable.update(declared)
         
         elif isinstance(declared, list):
             indexable.update(dict.fromkeys(declared, 1.0))
     
     # Determine if we actually need to re-index or not.
     if dirty is not None and not set(indexable).intersection(dirty):
         return
     
     # The entry is stored under the string form of the id (see the save
     # below), so look stale entries up with that same string key.  Remove
     # every match, not just the first, so duplicates cannot linger.
     doc_id = str(self.id)
     
     for stale in list(DocumentIndex.objects(doc_id=doc_id)):
         stale.delete()
     
     # Accumulate a per-term score: each occurrence of a term adds the weight
     # of the attribute it was found in.
     occurrences = defaultdict(float)
     
     for attr, weight in indexable.items():
         value = getattr(self, attr)
         
         if isinstance(value, basestring):
             words = lexer.strip(value)
         
         elif isinstance(value, (tuple, list, set)):
             # Collections are flattened into one space-separated UTF-8 string.
             words = lexer.strip(u' '.join(value).encode('utf8'))
         
         else:
             # Other value types (None, numbers, ...) contribute no terms.
             continue
         
         for word in words:
             occurrences[word] += weight
     
     # Save the index and terms; ``length`` is the number of distinct terms.
     index = DocumentIndex(doc_id=doc_id, length=len(occurrences), terms=occurrences)
     index.save(safe=False)
Example #2
0
 def delete(self, safe=False):
     """Delete this asset, its search index data and all of its descendants.
     
     Steps, in order: remove the DocumentIndex entries for this asset, delete
     every child (depth-first), detach this asset from its parent's
     ``children`` list and save the parent, then delete the asset itself.
     
     ``safe`` is forwarded to the parent class's ``delete`` for this asset
     only; children are deleted with their own default.
     
     Returns whatever the parent class's ``delete`` returns.
     """
     # Index entries are keyed by ``doc_id`` (the string form of the asset
     # id), not by the index document's own ``id``.
     for index in list(DocumentIndex.objects(doc_id=str(self.id))):
         index.delete()
     
     # Depth-first cascading delete.  Iterate over a snapshot: a child's
     # delete() removes itself from its parent's ``children`` list, and
     # mutating a list while iterating it skips elements.
     for child in list(self.children):
         child.delete()
     
     # Remove reference to self from parent asset.
     parent = self.parent
     if parent:
         parent.children.remove(self)
         parent.save()
     
     # Actually delete this asset.
     return super(Asset, self).delete(safe=safe)
Example #3
0
 def results(self, query=None):
     """Run a search and return ``(score, asset)`` pairs.
     
     ``query`` defaults to ``self.query``; when both are None an empty list is
     returned.  The query is lower-cased and passed through ``strip`` and
     ``keywords``; the first two groups returned by ``keywords`` are treated
     as required terms and the third as excluded terms.
     
     Terms of the form ``tag:<value>`` and ``kind:<value>`` are not searched
     in the index; they become filters on the Asset query instead (``kind``
     is a case-insensitive regex on ``_cls``).  Any other ``prefix:value``
     term is dropped.
     
     Returns:
         * a generator of ``(1.0, asset)`` ordered by ``created`` when no
           index terms remain after extracting the filters;
         * otherwise a list of ``(score, asset)`` sorted by descending
           Okapi BM25 score.  Assets rejected by the filters are omitted.
     """
     if query is None: query = self.query
     if query is None: return []
     
     parsed = keywords(' '.join(strip(query.lower())))
     required = set(parsed[0] + parsed[1])
     excluded = set(parsed[2])
     index_query = dict()
     asset_query = dict()
     
     # Pull ``prefix:value`` filters out of the required terms.  Iterate over
     # a copy because the set is modified inside the loop.
     for term in list(required):
         if ':' not in term:
             continue
         
         required.remove(term)
         prefix, _, value = term.partition(':')
         
         if prefix == 'tag':
             asset_query.setdefault('tags', list()).append(value)
         
         elif prefix == 'kind':
             # NOTE(review): ``value`` is used as a raw regular expression.
             asset_query.setdefault('__raw__', dict())['_cls'] = {
                     '$regex' : value,
                     '$options': 'i'
                 }
     
     fields = ('title', 'description', 'path', 'acl')
     
     # No index terms left: list the (filtered) assets with a constant score.
     if not required and not excluded:
         def gen():
             for record in Asset.objects(**asset_query).only(*fields).order_by('created'):
                 yield 1.0, record
         return gen()
     
     for term in required:
         index_query['terms__%s__exists' % (term, )] = True
     
     for term in excluded:
         index_query['terms__%s__exists' % (term, )] = False
     
     # An empty index can match nothing; this also avoids dividing by zero
     # when computing the average document length below.
     num_docs = DocumentIndex.objects.count()
     if not num_docs:
         return []
     
     # Calculate the inverse document frequency for each required term.  The
     # document frequency is the number of index entries containing that
     # specific term, i.e. where the ``terms.<term>`` key exists.
     idfs = {}
     
     for term in required:
         term_docs = DocumentIndex.objects(**{'terms__%s__exists' % (term, ): True}).count()
         idfs[term] = log_((num_docs - term_docs + 0.5) / (term_docs + 0.5))
     
     # Get the average document length.
     avg_doc_length = sum(i.length for i in DocumentIndex.objects.only('length')) / float(num_docs)
     
     # BM25 free parameters: term-frequency saturation and length normalisation.
     k = 2.0
     b = 0.75
     
     # Score every matching index entry:
     #   score = sum(idf * tf * (k + 1) / (tf + k * (1 - b + b * len / avg_len)))
     scored = []
     
     for idx in DocumentIndex.objects(**index_query):
         rel_doc_size = idx.length / avg_doc_length if avg_doc_length else 0.0
         normaliser = k * (1.0 - b + b * rel_doc_size)
         score = 0.0
         
         for term, idf in idfs.items():
             frequency = idx.terms[term]
             score += idf * (frequency * (k + 1.0)) / (frequency + normaliser)
         
         scored.append((score, idx.doc_id))
     
     # Resolve each hit to its asset, applying the tag/kind filters.  A hit
     # whose asset is filtered out (or no longer exists) is skipped rather
     # than returned as ``(score, None)``.
     matches = []
     
     for score, id_ in scored:
         record = Asset.objects(id=id_, **asset_query).only(*fields).first()
         
         if record is not None:
             matches.append((score, record))
     
     # Highest score first; compare on the score only.
     matches.sort(key=lambda match: match[0], reverse=True)
     return matches