Beispiel #1
0
    def daemon(self):
        """
        Daemon function
        Continuously watches self.directory for
        changes and adds them to the index.

        Blocks until interrupted with Ctrl-C (KeyboardInterrupt). On the
        way out -- whether through Ctrl-C or an unexpected error in the
        wait loop -- the observer is stopped and joined, the buffered
        writer is closed and the index is closed.
        """
        ix = self.get_ix()
        writer = BufferedWriter(ix, limit=100)
        event_handler = IndexWriterEventHandler(
            writer,
            self,
            self.all,
            self.exclude,
            self.include,
        )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the daemon: fall through to
            # the cleanup below and return quietly.
            pass
        finally:
            # Stop and join the observer first so that no event handler is
            # still writing while the writer is being shut down.
            observer.stop()
            observer.join()
            # close() performs the final commit. A plain commit() would
            # re-open the underlying writer and restart the periodic commit
            # timer, leaving the index write lock held.
            writer.close()
            ix.close()
def bootstrap_index(dirname=None, indexname=None):
    """
    Create the spam index and seed it with a single placeholder post.

    When both ``dirname`` and ``indexname`` are given the index is
    initialised there via ``search.init_index`` using ``spam_schema()``;
    otherwise ``init_spam_index()`` is used.

    Returns the index object.
    """
    if dirname and indexname:
        ix = search.init_index(dirname=dirname, indexname=indexname, schema=spam_schema())
    else:
        ix = init_spam_index()

    writer = BufferedWriter(ix)
    try:
        # Write the placeholder document to the index.
        index_writer(writer=writer, title="Placeholder",
                     content_length=0, is_spam=True,
                     content='CONTENT', uid=STARTER_UID)
    finally:
        # close() commits whatever is buffered and releases the writer, so
        # the write lock and commit timer are not leaked if indexing fails.
        writer.close()

    return ix
Beispiel #3
0
    def daemon(self):
        """
        Daemon function
        Continuously watches self.directory for
        changes and adds them to the index.

        Blocks until interrupted with Ctrl-C (KeyboardInterrupt). On the
        way out -- whether through Ctrl-C or an unexpected error in the
        wait loop -- the observer is stopped and joined, the buffered
        writer is closed and the index is closed.
        """
        ix = self.get_ix()
        writer = BufferedWriter(ix, limit=100)
        event_handler = IndexWriterEventHandler(writer, self, self.all,
                                                self.exclude, self.include,
                                                )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the daemon: fall through to
            # the cleanup below and return quietly.
            pass
        finally:
            # Stop and join the observer first so that no event handler is
            # still writing while the writer is being shut down.
            observer.stop()
            observer.join()
            # close() performs the final commit. A plain commit() would
            # re-open the underlying writer and restart the periodic commit
            # timer, leaving the index write lock held.
            writer.close()
            ix.close()
Beispiel #4
0
def _score_one_out(spam, ham, size, niter, limitmb):
    """
    Build a fresh training index and score the post held out of it.

    Returns a ``(post, score)`` tuple where ``post`` is what
    ``one_out_train`` returned and ``score`` is ``compute_score`` for
    that post against the freshly built index.
    """
    # Remove previous index
    if os.path.exists(TRAIN_DIR):
        shutil.rmtree(TRAIN_DIR)

    ix = search.init_index(
        dirname=TRAIN_DIR,
        indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
        schema=spam_schema())
    try:
        writer = BufferedWriter(ix,
                                limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))
        try:
            index_writer(writer=writer,
                         title="Placeholder",
                         content_length=0,
                         is_spam=True,
                         content='CONTENT',
                         uid=STARTER_UID)

            # Fill the index with training posts, keeping one post out.
            one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)
        finally:
            # close() commits the buffered documents and releases the
            # writer even when indexing fails part-way through.
            writer.close()

        return one_out, compute_score(post=one_out, ix=ix)
    finally:
        # The index is rebuilt from scratch on every iteration, so release
        # it before its directory is removed again.
        ix.close()


def test_classify(threshold=None,
                  niter=100,
                  limitmb=1024,
                  size=100,
                  verbosity=0):
    """
    Run a leave-one-out evaluation of the spam classifier and print a report.

    threshold: score at or above which a post is predicted to be spam;
               defaults to ``settings.SPAM_THRESHOLD``.
    niter:     number of train/score iterations.
    limitmb:   forwarded to the index writer as its ``limitmb`` argument.
    size:      forwarded to ``one_out_train`` and ``sizer``.
    verbosity: forwarded to ``detail`` as ``verb``.

    Returns None; results are printed and passed to ``report``.
    """
    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Posts treated as spam: marked as spam or deleted.
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Valid answers and comments are used as ham.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Get list of id's for both
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = Identify spam correctly.
    # tn = Identify valid post correctly.
    # fn = Missed to identify a spam.
    # fp = Mis-identified valid post as spam.
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0
    elapsed, progress = util.timer_func()

    for i in range(niter):
        one_out, post_score = _score_one_out(spam=spam, ham=ham, size=size,
                                             niter=niter, limitmb=limitmb)

        predicted_spam = post_score >= threshold
        is_spam = one_out.is_spam or one_out.is_deleted

        if is_spam:
            seen_spam += 1
        else:
            seen_ham += 1

        detail(is_spam=is_spam,
               predict=predicted_spam,
               post=one_out,
               verb=verbosity,
               post_score=post_score)

        # Exactly one confusion-matrix cell is incremented per iteration.
        if predicted_spam and is_spam:
            tp += 1
        elif predicted_spam:
            fp += 1
        elif is_spam:
            fn += 1
        else:
            tn += 1

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)
    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")
    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)
Beispiel #5
0
class Index(object):
    '''
    Class to manage index readers and writers.

    Indexes and buffered writers are created lazily on first use and
    cached for later calls.
    '''

    # Cached source index; bound on the instance by the first source() call.
    _source = None
    # Cached target indexes, keyed by language.
    # NOTE(review): _target and _target_writer are class-level dicts that
    # are mutated in place, so they are shared by every Index instance,
    # whereas _source and _source_writer are rebound per instance.
    # Confirm the sharing is intended.
    _target = {}
    # Cached buffered writer for the source index.
    _source_writer = None
    # Cached buffered writers for target indexes, keyed by language.
    _target_writer = {}

    def source(self):
        '''
        Returns source index.

        The index is created when it cannot be opened because it is
        empty or because of an IOError (eg. path does not exist).
        '''
        if self._source is None:
            try:
                self._source = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='source'
                )
            except (whoosh.index.EmptyIndexError, IOError):
                # Empty index, or eg. path does not exist
                self._source = create_source_index()
        return self._source

    def target(self, lang):
        '''
        Returns target index for given language.

        The index is created when it cannot be opened because it is
        empty or because of an IOError, the same as source() does.
        '''
        if lang not in self._target:
            try:
                self._target[lang] = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='target-%s' % lang
                )
            except (whoosh.index.EmptyIndexError, IOError):
                # Empty index, or eg. path does not exist
                self._target[lang] = create_target_index(lang)
        return self._target[lang]

    def source_writer(self, buffered=True):
        '''
        Returns source index writer (by default buffered).

        The buffered writer is created once and reused; a non-buffered
        writer is created anew on every call.
        '''
        if not buffered:
            return self.source().writer()
        if self._source_writer is None:
            self._source_writer = BufferedWriter(self.source())
        return self._source_writer

    def target_writer(self, lang, buffered=True):
        '''
        Returns target index writer (by default buffered) for given language.

        The buffered writer is created once per language and reused; a
        non-buffered writer is created anew on every call.
        '''
        if not buffered:
            return self.target(lang).writer()
        if lang not in self._target_writer:
            self._target_writer[lang] = BufferedWriter(self.target(lang))
        return self._target_writer[lang]

    def source_searcher(self, buffered=True):
        '''
        Returns source index searcher (on buffered writer).

        With buffered=False the searcher is taken from the index itself.
        '''
        if not buffered:
            return self.source().searcher()
        return self.source_writer(buffered).searcher()

    def target_searcher(self, lang, buffered=True):
        '''
        Returns target index searcher (on buffered writer) for given language.

        With buffered=False the searcher is taken from the index itself.
        '''
        if not buffered:
            return self.target(lang).searcher()
        return self.target_writer(lang, buffered).searcher()

    def commit(self):
        '''
        Commits pending changes.

        Only buffered writers that have already been created are
        committed; it is a no-op when none exist.
        '''
        # The source writer is created lazily, so it may not exist yet.
        if self._source_writer is not None:
            self._source_writer.commit()
        for writer in self._target_writer.values():
            writer.commit()