Example #1
    def _update_doc_in_index(self, index_writer, doc):
        """
        Add/Update a document in the index
        """
        all_labels = set(self.docsearch.label_list)
        doc_labels = set(doc.labels)
        new_labels = doc_labels.difference(all_labels)

        # new labels can appear when we recreate the index from scratch
        for label in new_labels:
            self.docsearch.create_label(label)

        last_mod = datetime.datetime.fromtimestamp(doc.last_mod)
        docid = unicode(doc.docid)

        dochash = doc.get_docfilehash()
        dochash = (u"%X" % dochash)

        doc_txt = doc.get_index_text()
        assert (isinstance(doc_txt, unicode))
        labels_txt = doc.get_index_labels()
        assert (isinstance(labels_txt, unicode))

        query = whoosh.query.Term("docid", docid)
        index_writer.delete_by_query(query)

        index_writer.update_document(
            docid=docid,
            doctype=doc.doctype,
            docfilehash=dochash,
            content=strip_accents(doc_txt),
            label=strip_accents(labels_txt),
            date=doc.date,
            last_read=last_mod)
        return True
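
All of the examples on this page call a strip_accents() helper that is not shown here. The following is a minimal sketch of such a helper, assuming a typical unicodedata-based implementation; the project's actual version may differ.

import unicodedata


def strip_accents(string):
    """
    Remove accents from a unicode string (sketch).
    NFD normalization splits accented characters into a base character
    plus combining marks; the combining marks (category 'Mn') are dropped.
    """
    return u"".join(
        character for character in unicodedata.normalize('NFD', string)
        if unicodedata.category(character) != 'Mn'
    )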
Example #2
    def _update_doc_in_index(self,
                             index_writer,
                             doc,
                             fit_label_estimator=True):
        """
        Add/Update a document in the index
        """
        all_labels = set(self.docsearch.label_list)
        doc_labels = set(doc.labels)
        new_labels = doc_labels.difference(all_labels)

        # register any label the doc searcher does not know about yet
        if new_labels:
            for label in new_labels:
                self.docsearch.label_list += [label]
            self.docsearch.label_list.sort()
            if fit_label_estimator:
                self.docsearch.fit_label_estimator(labels=new_labels)

        if fit_label_estimator:
            self.docsearch.fit_label_estimator([doc])
        last_mod = datetime.datetime.fromtimestamp(doc.last_mod)
        docid = unicode(doc.docid)

        dochash = doc.get_docfilehash()
        dochash = (u"%X" % dochash)

        index_writer.update_document(
            docid=docid,
            doctype=doc.doctype,
            docfilehash=dochash,
            content=strip_accents(doc.get_index_text()),
            label=strip_accents(doc.get_index_labels()),
            date=doc.date,
            last_read=last_mod)
        return True
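
For reference, update_document() in Examples #1 and #2 writes the fields docid, doctype, docfilehash, content, label, date and last_read. A plausible Whoosh schema covering those fields could look like the sketch below; the exact field types are assumptions, not taken from the project.

import whoosh.fields

# Hypothetical schema matching the fields written by update_document()
# in Examples #1 and #2; the field types are assumptions.
schema = whoosh.fields.Schema(
    docid=whoosh.fields.ID(stored=True, unique=True),
    doctype=whoosh.fields.ID(stored=True),
    docfilehash=whoosh.fields.ID(stored=True),
    content=whoosh.fields.TEXT(),
    label=whoosh.fields.KEYWORD(commas=True),
    date=whoosh.fields.DATETIME(stored=True),
    last_read=whoosh.fields.DATETIME(stored=True),
)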
Example #3
 def __label_cmp(self, other):
     """
     Comparison function. Can be used to sort labels alphabetically.
     """
     if other is None:
         return -1
     label_name = strip_accents(self.name).lower()
     other_name = strip_accents(other.name).lower()
     cmp_r = cmp(label_name, other_name)
     if cmp_r != 0:
         return cmp_r
     return cmp(self.get_color_str(), other.get_color_str())
Beispiel #4
0
    def __label_cmp(self, other, text_only=False):
        """
        Comparison function. Can be used to sort labels alphabetically.

        Labels are deemed equal if they have the same (or similar) text,
        regardless of color.
        """
        if other is None:
            return -1
        label_name = strip_accents(self.name).lower()
        other_name = strip_accents(other.name).lower()
        cmp_r = cmp(label_name, other_name)
        if cmp_r != 0 or text_only:
            return cmp_r
        return cmp(self.get_color_str(), other.get_color_str())
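
__label_cmp() follows the Python 2 cmp() convention (negative, zero or positive). For label_list.sort() or sorted() to make use of it, the label class typically wraps it in the rich comparison methods. The following is a sketch of that assumed wiring, written as additional methods of the same class; it is not shown in the examples above.

    # Assumed wiring: expose the cmp-style comparator through the rich
    # comparison operators so that sorting label lists works.
    def __lt__(self, other):
        return self.__label_cmp(other) < 0

    def __gt__(self, other):
        return self.__label_cmp(other) > 0

    def __eq__(self, other):
        return self.__label_cmp(other) == 0

    def __ne__(self, other):
        return self.__label_cmp(other) != 0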
Example #5
    def find_documents(self,
                       sentence,
                       limit=None,
                       must_sort=True,
                       search_type='fuzzy'):
        """
        Returns all the documents matching the given keywords

        Arguments:
            sentence --- the search query (a sentence)
        Returns:
            An array of documents (doc objects)
        """
        sentence = sentence.strip()
        sentence = strip_accents(sentence)

        if sentence == u"":
            return self.docs

        result_list_list = []
        total_results = 0

        for query_parser in self.search_param_list[search_type]:
            query = query_parser["query_parser"].parse(sentence)
            if must_sort and "sortedby" in query_parser:
                result_list = self.__searcher.search(
                    query, limit=limit, sortedby=query_parser["sortedby"])
            else:
                result_list = self.__searcher.search(query, limit=limit)

            result_list_list.append(result_list)
            total_results += len(result_list)

            # when the order doesn't matter, stop as soon as we have
            # enough results
            if not must_sort and limit is not None and total_results >= limit:
                break

        # merging results
        results = result_list_list[0]
        for result_intermediate in result_list_list[1:]:
            results.extend(result_intermediate)

        docs = [self._docs_by_id.get(result['docid']) for result in results]
        # drop results that don't match any known document
        docs = [doc for doc in docs if doc is not None]

        if limit is not None:
            docs = docs[:limit]

        return docs
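
A hypothetical call site for find_documents(); the docsearch object and the query string are made up for illustration and are not taken from the examples.

# Hypothetical usage; 'docsearch' is assumed to be the object that
# owns find_documents().
docs = docsearch.find_documents(u"invoice 2016", limit=50,
                                must_sort=True, search_type='fuzzy')
for doc in docs:
    print(doc.docid)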
Example #6
 def get_index_text(self):
     txt = u""
     for page in self.pages:
         txt += u"\n".join([unicode(line) for line in page.text])
     extra_txt = self.extra_text
     if extra_txt != u"":
         txt += extra_txt + u"\n"
     txt = txt.strip()
     txt = strip_accents(txt)
     if txt == u"":
         # make sure the text field is not empty. Whoosh doesn't like that
         txt = u"empty"
     return txt
Example #7
 def get_index_labels(self):
     return u",".join([strip_accents(unicode(label.name))
                       for label in self.labels])