def __init__(self, rootdir, callback=dummy_progress_cb):
    """
    Index files in rootdir (see constructor)

    Arguments:
        callback --- called during the indexation (may be called *often*).
            step : DocSearch.INDEX_STEP_READING or
                DocSearch.INDEX_STEP_SORTING
            progression : how many elements done yet
            total : number of elements to do
            document (only if step == DocSearch.INDEX_STEP_READING):
                file being read
    """
    self.rootdir = rootdir
    # XDG base directory spec: fall back to ~/.local/share when
    # XDG_DATA_HOME is unset
    base_indexdir = os.getenv("XDG_DATA_HOME",
                              os.path.expanduser("~/.local/share"))
    self.indexdir = os.path.join(base_indexdir, "paperwork", "index")
    mkdir_p(self.indexdir)

    self.__docs_by_id = {}  # docid --> doc
    self.label_list = []

    need_index_rewrite = True
    try:
        # lazy %-style logging args: formatting is skipped when the
        # level is disabled
        logger.info("Opening index dir '%s' ...", self.indexdir)
        self.index = whoosh.index.open_dir(self.indexdir)
        # check that the schema is up-to-date
        # We use the string representation of the schemas, because previous
        # versions of whoosh don't always implement __eq__
        if str(self.index.schema) == str(self.WHOOSH_SCHEMA):
            need_index_rewrite = False
    # 'except X as exc' (Python 3 compatible) instead of the legacy
    # Python 2 'except X, exc' form
    except whoosh.index.EmptyIndexError as exc:
        logger.warning("Failed to open index '%s'", self.indexdir)
        logger.warning("Exception was: %s", exc)
def add_page(self, img, boxes):
    """
    Append a new page (image + word boxes) at the end of this document.

    Arguments:
        img --- image of the new page
        boxes --- word boxes of the new page
    Returns:
        the freshly appended page
    """
    # make sure the document directory exists on disk first
    mkdir_p(self.path)
    appended = ImgPage(self, self.nb_pages)
    appended.img = img
    appended.boxes = boxes
    # page list changed: invalidate any cached state
    self.drop_cache()
    return self.pages[-1]
def add_page(self, img, boxes):
    """
    Append a new page (image + word boxes) at the end of this document.

    Arguments:
        img --- image of the new page
        boxes --- word boxes of the new page
    Returns:
        the freshly appended page
    """
    mkdir_p(self.path)
    # lazy %-style logging args (consistent with the other add_page
    # variant) instead of eager '%' formatting
    logger.info("Adding page %d to %s", self.nb_pages, self)
    page = ImgPage(self, self.nb_pages)
    page.img = img
    page.boxes = boxes
    # page list changed: invalidate any cached state
    self.drop_cache()
    return self.pages[-1]
def load(self, label_name, force_reload=False):
    """
    Load (or reload) the bayesian filter for the given label.

    Arguments:
        label_name --- name of the label whose filter must be loaded
        force_reload --- rebuild the in-memory filter even if already loaded
    """
    # NOTE(review): hash() of a str is randomized per interpreter run
    # (PYTHONHASHSEED), so this cache directory name is not stable
    # across runs unless hash randomization is disabled — TODO confirm
    label_hash = hex(abs(hash(label_name)))[2:]
    baye_dir = os.path.join(self._bayes_dir, label_hash)
    mkdir_p(baye_dir)
    if label_name not in self._bayes or force_reload:
        self._bayes[label_name] = simplebayes.SimpleBayes(
            cache_path=baye_dir
        )
        try:
            self._bayes[label_name].cache_train()
        except Exception:
            # a corrupted on-disk cache must not abort loading;
            # log it and keep the (untrained) filter
            logger.exception(
                "Could not load cache for label '%s' from %s",
                label_name, baye_dir)
def add_page(self, img, boxes):
    """
    Append a page at the end of the document and update storage accounting.

    Arguments:
        img --- image of the new page
        boxes --- word boxes of the new page
    Returns:
        the freshly appended page
    """
    # make sure the document directory exists on disk first
    mkdir_p(self.path)
    logger.info("Adding page %d to %s", self.nb_pages, self)
    appended = ImgPage(self, self.nb_pages)
    appended.img = img
    appended.boxes = boxes
    # page list changed: invalidate any cached state ...
    self.drop_cache()
    # ... and account for one more page kept in storage
    self._update_storage(1)
    return self.pages[-1]
def steal_page(self, page):
    """
    Steal a page from another document

    Arguments:
        page --- the page to move into this document
    """
    if page.doc == self:
        # already belongs to this document: nothing to do
        return
    mkdir_p(self.path)
    new_page = ImgPage(self, self.nb_pages)
    # lazy %-style logging args instead of eager '%' formatting,
    # consistent with the rest of the file
    logger.info("%s --> %s", page, new_page)
    new_page._steal_content(page)
    # both documents changed on disk: refresh both caches
    page.doc.drop_cache()
    self.drop_cache()
def load(self, label_name, force_reload=False):
    """
    Load (or reload) the bayesian filter for the given label.

    Arguments:
        label_name --- name of the label whose filter must be loaded
        force_reload --- rebuild the in-memory filter even if already loaded
    """
    # cache directory is derived from the label name
    cache_dir = os.path.join(
        self._bayes_dir, hex(abs(hash(label_name)))[2:]
    )
    mkdir_p(cache_dir)
    if force_reload or label_name not in self._bayes:
        bayes = simplebayes.SimpleBayes(cache_path=cache_dir)
        self._bayes[label_name] = bayes
        try:
            bayes.cache_train()
        except Exception:
            # best effort: a broken cache is logged, not fatal
            logger.exception(
                "Could not load cache "
                "for label '%s' from %s",
                label_name, bayes.get_cache_location())
def insert_page(self, img, boxes, page_nb):
    """
    Insert a page at position page_nb (clamped to the current page count).

    Arguments:
        img --- image of the new page
        boxes --- word boxes of the new page
        page_nb --- index at which the new page must be inserted
    Returns:
        the newly inserted page
    """
    mkdir_p(self.path)
    # lazy %-style logging args instead of eager '%' formatting
    logger.info("Inserting page %d to %s", page_nb, self)

    if page_nb > self.nb_pages:
        page_nb = self.nb_pages

    # make a hole: shift pages [page_nb .. last] one slot to the right,
    # iterating from the end so no page overwrites another.
    # (original code reused 'page_nb' as the loop variable, shadowing
    # the parameter; a distinct name avoids that fragility)
    pages = self.pages
    for idx in range(self.nb_pages - 1, page_nb - 1, -1):
        pages[idx].change_index(offset=1)

    # .. and fill it
    page = ImgPage(self, page_nb)
    page.img = img
    page.boxes = boxes
    self.drop_cache()
    return self.pages[page_nb]
def check_workdir(self):
    """
    Make sure the current work dir (see config.PaperworkConfig) exists,
    creating it (and any missing parents) if needed.

    NOTE(review): an earlier docstring claimed the settings dialog would
    be opened when the directory is missing; this implementation only
    creates the directory.
    """
    mkdir_p(self.rootdir)
def __init__(self, rootdir, indexdir=None, callback=dummy_progress_cb,
             label_store=None):
    """
    Index files in rootdir (see constructor)

    Arguments:
        rootdir --- directory containing the documents to index
        indexdir --- where the index is stored; defaults to
            $XDG_DATA_HOME/paperwork (or ~/.local/share/paperwork)
        callback --- called during the indexation (may be called *often*).
            step : DocSearch.INDEX_STEP_READING or
                DocSearch.INDEX_STEP_SORTING
            progression : how many elements done yet
            total : number of elements to do
            document (only if step == DocSearch.INDEX_STEP_READING):
                file being read
        label_store --- required; storage backend for the labels
    """
    # NOTE(review): 'assert' is stripped under 'python -O'; a required
    # argument would be better enforced with an explicit raise
    assert(label_store)
    self.label_store = label_store
    self.rootdir = rootdir
    if indexdir is None:
        # XDG base directory spec: fall back to ~/.local/share
        base_data_dir = os.getenv(
            "XDG_DATA_HOME",
            os.path.expanduser("~/.local/share")
        )
        indexdir = os.path.join(base_data_dir, "paperwork")
    # the whoosh index and the bayesian label-guesser caches live in
    # two sibling subdirectories of indexdir
    self.indexdir = os.path.join(indexdir, "index")
    mkdir_p(self.indexdir)
    self.label_guesser_dir = os.path.join(indexdir, "label_guessing")
    mkdir_p(self.label_guesser_dir)
    self._docs_by_id = {}  # docid --> doc
    self.labels = {}  # label name --> label
    need_index_rewrite = True
    try:
        logger.info("Opening index dir '%s' ...", self.indexdir)
        self.index = whoosh.index.open_dir(self.indexdir)
        # check that the schema is up-to-date
        # We use the string representation of the schemas, because previous
        # versions of whoosh don't always implement __eq__
        if str(self.index.schema) == str(self.WHOOSH_SCHEMA):
            need_index_rewrite = False
    # ValueError can be raised by whoosh on a corrupted index
    except (whoosh.index.EmptyIndexError, ValueError) as exc:
        logger.warning("Failed to open index '%s'", self.indexdir)
        logger.warning("Exception was: %s", exc)
    if need_index_rewrite:
        # no usable index (missing, corrupted, or outdated schema):
        # start over with a fresh one
        logger.info("Creating a new index")
        self.index = whoosh.index.create_in(self.indexdir,
                                            self.WHOOSH_SCHEMA)
        logger.info("Index '%s' created", self.indexdir)
    self.__searcher = self.index.searcher()

    # fuzzy term matching with a fixed maximum edit distance of 1 and
    # constant scoring, used by the 'fuzzy' query parsers below
    class CustomFuzzy(whoosh.qparser.query.FuzzyTerm):
        def __init__(self, fieldname, text, boost=1.0, maxdist=1,
                     prefixlength=0, constantscore=True):
            whoosh.qparser.query.FuzzyTerm.__init__(
                self, fieldname, text, boost, maxdist,
                prefixlength, constantscore=True
            )

    # sort results by score first, then by date (most recent first)
    facets = [
        whoosh.sorting.ScoreFacet(),
        whoosh.sorting.FieldFacet("date", reverse=True)
    ]

    # two search modes: 'fuzzy' (edit-distance + prefix matching) and
    # 'strict' (exact terms only); each entry pairs a query parser over
    # the 'label' and 'content' fields with the sort facets above
    self.search_param_list = {
        'fuzzy': [
            {
                "query_parser": whoosh.qparser.MultifieldParser(
                    ["label", "content"], schema=self.index.schema,
                    termclass=CustomFuzzy),
                "sortedby": facets
            },
            {
                "query_parser": whoosh.qparser.MultifieldParser(
                    ["label", "content"], schema=self.index.schema,
                    termclass=whoosh.qparser.query.Prefix),
                "sortedby": facets
            },
        ],
        'strict': [
            {
                "query_parser": whoosh.qparser.MultifieldParser(
                    ["label", "content"], schema=self.index.schema,
                    termclass=whoosh.query.Term),
                "sortedby": facets
            },
        ],
    }

    self.label_guesser = LabelGuesser(self.label_guesser_dir)

    self.check_workdir()
    self.reload_index(callback)