def daemon(self):
    """
    Daemon function.

    Continuously watches self.directory (recursively) for changes and
    adds them to the index through a buffered writer. Blocks until the
    process receives a KeyboardInterrupt, then shuts everything down and
    returns normally.
    """
    ix = self.get_ix()
    writer = BufferedWriter(ix, limit=100)
    try:
        event_handler = IndexWriterEventHandler(
            writer,
            self,
            self.all,
            self.exclude,
            self.include,
        )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()
        try:
            # The observer works in its own thread; this thread only
            # has to stay alive until the user interrupts it.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop the daemon.
            pass
        finally:
            # Stop and wait for the observer first so that no event
            # handler is still using the writer when it gets closed.
            observer.stop()
            observer.join()
    finally:
        # close() rather than commit(): a BufferedWriter has to be closed
        # when finished so the pending documents are flushed and the
        # writer is released instead of being left open.
        writer.close()
        ix.close()
def bootstrap_index(dirname=None, indexname=None):
    """
    Create the spam index and seed it with a single placeholder entry.

    When both dirname and indexname are given, the index is initialised
    at that location with the spam schema; otherwise the default spam
    index from init_spam_index() is used.

    Returns the index object.
    """
    custom_location = dirname and indexname
    if not custom_location:
        index = init_spam_index()
    else:
        index = search.init_index(
            dirname=dirname,
            indexname=indexname,
            schema=spam_schema(),
        )

    # The one starter record handed to the index writer.
    placeholder = dict(
        title="Placeholder",
        content_length=0,
        is_spam=True,
        content='CONTENT',
        uid=STARTER_UID,
    )

    buffered = BufferedWriter(index)
    index_writer(writer=buffered, **placeholder)
    buffered.commit()
    buffered.close()

    return index
def daemon(self):
    """
    Daemon function.

    Continuously watches self.directory (recursively) for changes and
    adds them to the index through a buffered writer. Blocks until the
    process receives a KeyboardInterrupt, then shuts everything down and
    returns normally.
    """
    ix = self.get_ix()
    writer = BufferedWriter(ix, limit=100)
    try:
        event_handler = IndexWriterEventHandler(
            writer,
            self,
            self.all,
            self.exclude,
            self.include,
        )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()
        try:
            # The observer works in its own thread; this thread only
            # has to stay alive until the user interrupts it.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop the daemon.
            pass
        finally:
            # Stop and wait for the observer first so that no event
            # handler is still using the writer when it gets closed.
            observer.stop()
            observer.join()
    finally:
        # close() rather than commit(): a BufferedWriter has to be closed
        # when finished so the pending documents are flushed and the
        # writer is released instead of being left open.
        writer.close()
        ix.close()
def test_classify(threshold=None, niter=100, limitmb=1024, size=100, verbosity=0):
    """
    Repeatedly train a throw-away spam index and score one held-out post.

    For each of the niter iterations a fresh index is built in TRAIN_DIR,
    one_out_train() fills it and hands back the post that was left out,
    and that post is scored against the index. The prediction
    (score >= threshold) is compared with the post's real status and the
    confusion counts are accumulated. A summary is printed at the end.

    threshold defaults to settings.SPAM_THRESHOLD when not given.
    Returns None.
    """
    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Posts flagged as spam or deleted form the spam pool.
    spam_query = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Valid answers/comments from authors with score <= 0 form the ham pool.
    ham_query = Post.objects.valid_posts(
        author__profile__score__lte=0,
        type__in=[Post.ANSWER, Post.COMMENT],
    )

    # Only the ids are needed from here on.
    spam = list(spam_query.values_list("id", flat=True))
    ham = list(ham_query.values_list("id", flat=True))

    # tp = spam identified correctly.
    # tn = valid post identified correctly.
    # fn = spam that was missed.
    # fp = valid post mis-identified as spam.
    tp = tn = fn = fp = 0
    seen_ham = seen_spam = 0

    elapsed, progress = util.timer_func()

    for i in range(niter):

        # Start every iteration from an empty training directory.
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        train_name = f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}"
        ix = search.init_index(dirname=TRAIN_DIR, indexname=train_name, schema=spam_schema())

        buffer_limit = int((niter / 2) + 1)
        writer_options = {"limitmb": limitmb, "multisegment": True}
        writer = BufferedWriter(ix, limit=buffer_limit, writerargs=writer_options)

        index_writer(writer=writer, title="Placeholder", content_length=0,
                     is_spam=True, content='CONTENT', uid=STARTER_UID)

        # Build the training set; the returned post is the one held out.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)
        writer.commit()
        writer.close()

        post_score = compute_score(post=one_out, ix=ix)
        predicted_spam = post_score >= threshold

        is_spam = one_out.is_spam or one_out.is_deleted

        if is_spam:
            seen_spam += 1
        else:
            seen_ham += 1

        detail(is_spam=is_spam, predict=predicted_spam, post=one_out,
               verb=verbosity, post_score=post_score)

        # Update the confusion counts for this prediction.
        if predicted_spam:
            if is_spam:
                tp += 1
            else:
                fp += 1
        elif is_spam:
            fn += 1
        else:
            tn += 1

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)

    print(f"... \n{train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")

    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)
class Index(object):
    '''
    Class to manage index readers and writers.

    Opened indexes and buffered writers are cached on first use and
    reused by later calls.
    '''

    # Cached source index; None until source() is first called.
    _source = None
    # Cached target indexes keyed by language. Defined on the class, so
    # the dictionary is shared by every instance.
    _target = {}
    # Cached buffered writer for the source index; None until requested.
    _source_writer = None
    # Cached buffered writers for target indexes keyed by language.
    # Defined on the class, so shared by every instance.
    _target_writer = {}

    def source(self):
        '''
        Returns source index, creating it when it can not be opened.
        '''
        if self._source is None:
            try:
                self._source = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='source'
                )
            except (whoosh.index.EmptyIndexError, IOError):
                # IOError: eg. path does not exist
                self._source = create_source_index()
        return self._source

    def target(self, lang):
        '''
        Returns target index for given language, creating it when it
        can not be opened.
        '''
        if lang not in self._target:
            try:
                self._target[lang] = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='target-%s' % lang
                )
            except (whoosh.index.EmptyIndexError, IOError):
                # IOError is handled the same way as in source(), so a
                # missing index directory does not break target lookups.
                self._target[lang] = create_target_index(lang)
        return self._target[lang]

    def source_writer(self, buffered=True):
        '''
        Returns source index writer (by default buffered).

        The buffered writer is created once and cached; an unbuffered
        writer is created anew on every call.
        '''
        if not buffered:
            return self.source().writer()
        if self._source_writer is None:
            self._source_writer = BufferedWriter(self.source())
        return self._source_writer

    def target_writer(self, lang, buffered=True):
        '''
        Returns target index writer (by default buffered) for given
        language.

        The buffered writer is created once per language and cached; an
        unbuffered writer is created anew on every call.
        '''
        if not buffered:
            return self.target(lang).writer()
        if lang not in self._target_writer:
            self._target_writer[lang] = BufferedWriter(self.target(lang))
        return self._target_writer[lang]

    def source_searcher(self, buffered=True):
        '''
        Returns source index searcher (on buffered writer).
        '''
        if not buffered:
            return self.source().searcher()
        return self.source_writer(buffered).searcher()

    def target_searcher(self, lang, buffered=True):
        '''
        Returns target index searcher (on buffered writer) for given
        language.
        '''
        if not buffered:
            return self.target(lang).searcher()
        return self.target_writer(lang, buffered).searcher()

    def commit(self):
        '''
        Commits pending changes on all cached buffered writers.
        '''
        # The source writer is created lazily, so it may not exist yet
        # when only target writers (or no writers at all) were used.
        if self._source_writer is not None:
            self._source_writer.commit()
        for writer in self._target_writer.values():
            writer.commit()