Beispiel #1
0
def trainer(mgr, config, progress):
    """Train the classifier from the configured good and spam folders.

    ``config.training`` supplies the ``rebuild`` and ``rescore`` switches
    and the folder id lists.  If either folder list is empty, an error is
    reported through ``progress`` and nothing is trained.

    When rebuilding, training runs against a brand new temporary database
    created under ``mgr.data_directory``; the manager's classifier data
    adopts it only if training finishes without a stop request.  When
    re-scoring, the "filter now" settings on ``mgr.config`` are overwritten
    and the filter is run over the training folders.

    Always returns None; results are reported via ``progress``.
    """
    training = config.training
    rebuild = training.rebuild
    rescore = training.rescore

    # Both a good folder and a spam folder are required.
    if not (training.ham_folder_ids and training.spam_folder_ids):
        progress.error("You must specify at least one spam, and one good folder")
        return

    if not rebuild:
        classifier_data = mgr.classifier_data
    else:
        # Train into a fresh temporary bayes database which the manager
        # "adopts" only on completion.  A cancelled run therefore cannot
        # leave a "bad" db behind, and mail arriving while we train is not
        # classified with a half-built database.
        import os
        import manager
        data_dir = mgr.data_directory
        bayes_base = os.path.join(data_dir, "$sbtemp$default_bayes_database")
        mdb_base = os.path.join(data_dir, "$sbtemp$default_message_database")
        # Ask which storage manager to use, then create it.
        storage_class = manager.GetStorageManagerClass()
        classifier_data = manager.ClassifierData(
            storage_class(bayes_base, mdb_base), mgr)
        classifier_data.InitNew()

    # Up to three stages: train, save and (optionally) score.  Re-scoring
    # is much slower than training because each message has to be saved
    # back.  Saving can be slow too, but it only ever gets a single tick.
    stages = [("Training", .3 if rescore else .9), ("Saving", .1)]
    if rescore:
        stages.append(("Scoring", .6))
    progress.set_stages(tuple(stages))

    real_trainer(classifier_data, config, mgr.message_store, progress)

    if progress.stop_requested():
        return

    if rebuild:
        assert mgr.classifier_data is not classifier_data
        mgr.classifier_data.Adopt(classifier_data)
        classifier_data = mgr.classifier_data

    progress.tick()

    if rescore:
        # Point the "filter now" settings at every training folder.
        now_config = mgr.config.filter_now
        now_config.only_unread = False
        now_config.only_unseen = False
        now_config.action_all = False
        mgr_training = mgr.config.training
        now_config.folder_ids = (mgr_training.ham_folder_ids
                                 + mgr_training.spam_folder_ids)
        now_config.include_sub = (mgr_training.ham_include_sub
                                  or mgr_training.spam_include_sub)
        import filter as filter_module
        filter_module.filterer(mgr, mgr.config, progress)

    bayes = classifier_data.bayes
    summary = "Completed training with %d spam and %d good messages" % (
        bayes.nspam, bayes.nham)
    progress.set_status(summary)
Beispiel #2
0
def WizardTrainer(mgr, config, progress):
    """Train a brand new database on behalf of the configuration wizard.

    A new classifier-data object is created on "$sbwiz$"-prefixed base
    names inside ``mgr.data_directory``, and the candidate file names
    (".pck" and ".db" variants of each base) are recorded in
    ``config.wizard.temp_training_names``.  After training, the new data
    temporarily replaces ``mgr.classifier_data`` while progress is ticked,
    the folders are optionally re-scored and the status is reported.  The
    original classifier data is always restored afterwards, and the newly
    trained data is left on ``mgr.wizard_classifier_data``.
    """
    import os
    import manager
    import train

    data_dir = mgr.data_directory
    bayes_base = os.path.join(data_dir, "$sbwiz$default_bayes_database")
    mdb_base = os.path.join(data_dir, "$sbwiz$default_message_database")
    # For each extension: the bayes name first, then the message db name.
    config.wizard.temp_training_names = [
        base + ext
        for ext in (".pck", ".db")
        for base in (bayes_base, mdb_base)
    ]

    # Ask which storage manager to use, then create it.
    storage_class = manager.GetStorageManagerClass()
    classifier_data = manager.ClassifierData(
        storage_class(bayes_base, mdb_base), mgr)
    classifier_data.InitNew()

    rescore = config.training.rescore

    stages = [(_("Training"), .3 if rescore else .9), (_("Saving"), .1)]
    if rescore:
        stages.append((_("Scoring"), .6))
    progress.set_stages(tuple(stages))

    train.real_trainer(classifier_data, config, mgr.message_store, progress)

    # XXX - hack: the classifier data should really be passed in rather
    # than swapped onto the manager for the duration of the call.
    saved_classifier_data = mgr.classifier_data
    mgr.classifier_data = classifier_data  # only until the finally below
    try:
        progress.tick()

        if rescore:
            # Point the "filter now" settings at every training folder.
            training = config.training
            now_config = config.filter_now
            now_config.only_unread = False
            now_config.only_unseen = False
            now_config.action_all = False
            now_config.folder_ids = (training.ham_folder_ids
                                     + training.spam_folder_ids)
            now_config.include_sub = (training.ham_include_sub
                                      or training.spam_include_sub)
            import filter as filter_module
            filter_module.filterer(mgr, config, progress)

        bayes = classifier_data.bayes
        summary = _("Completed training with %d spam and %d good messages") % (
            bayes.nspam, bayes.nham)
        progress.set_status(summary)
    finally:
        mgr.wizard_classifier_data = classifier_data
        mgr.classifier_data = saved_classifier_data
Beispiel #3
0
def WizardTrainer(mgr, config, progress):
    """Build and train a separate wizard database, optionally re-scoring.

    The database lives on "$sbwiz$"-prefixed base names inside
    ``mgr.data_directory``; every possible file name (each base with a
    ".pck" and a ".db" extension) is stored in
    ``config.wizard.temp_training_names``.  Once trained, the new
    classifier data stands in for ``mgr.classifier_data`` while the
    remaining progress stages run.  Whatever happens in those stages, the
    manager gets its original classifier data back and the new data is
    stored as ``mgr.wizard_classifier_data``.
    """
    import os
    import manager
    import train

    base_dir = mgr.data_directory
    bayes_base = os.path.join(base_dir, "$sbwiz$default_bayes_database")
    mdb_base = os.path.join(base_dir, "$sbwiz$default_message_database")

    temp_names = []
    for extension in (".pck", ".db"):
        temp_names.extend([bayes_base + extension, mdb_base + extension])
    config.wizard.temp_training_names = temp_names

    # Work out which db manager to use, and create it.
    db_manager = manager.GetStorageManagerClass()(bayes_base, mdb_base)
    classifier_data = manager.ClassifierData(db_manager, mgr)
    classifier_data.InitNew()

    rescore = config.training.rescore

    training_stage = ("Training", .3 if rescore else .9)
    saving_stage = ("Saving", .1)
    if rescore:
        progress.set_stages((training_stage, saving_stage, ("Scoring", .6)))
    else:
        progress.set_stages((training_stage, saving_stage))

    train.real_trainer(classifier_data, config, mgr.message_store, progress)

    # XXX - more hacks: the classifier data ought to be passed in instead
    # of being swapped onto the manager.
    previous_data = mgr.classifier_data
    mgr.classifier_data = classifier_data  # temporary
    try:
        progress.tick()

        if rescore:
            # Configure "filter now" to cover all of the training folders.
            train_cfg = config.training
            now_config = config.filter_now
            now_config.only_unread = False
            now_config.only_unseen = False
            now_config.action_all = False
            now_config.folder_ids = (train_cfg.ham_folder_ids
                                     + train_cfg.spam_folder_ids)
            now_config.include_sub = (train_cfg.ham_include_sub
                                      or train_cfg.spam_include_sub)
            import filter as filter_module
            filter_module.filterer(mgr, config, progress)

        bayes = classifier_data.bayes
        counts = (bayes.nspam, bayes.nham)
        progress.set_status(
            "Completed training with %d spam and %d good messages" % counts)
    finally:
        mgr.wizard_classifier_data = classifier_data
        mgr.classifier_data = previous_data
Beispiel #4
0
def trainer(mgr, config, progress):
    """Train the classifier from the configured good and/or spam folders.

    ``config.training`` supplies the ``rebuild`` and ``rescore`` switches
    and the folder id lists.  If both folder lists are empty, an error is
    reported through ``progress`` and nothing is trained.

    When rebuilding, training runs against a new temporary database under
    ``mgr.data_directory``.  If training finishes without a stop request,
    the manager adopts that database, the current statistics are written
    to the debug log and the statistics are then reset.  When re-scoring,
    the "filter now" settings on ``mgr.config`` are overwritten and the
    filter is run over the training folders.

    Always returns None; results are reported via ``progress``.
    """
    options = config.training
    rebuild, rescore = options.rebuild, options.rescore

    # At least one folder of either kind is needed.
    if not (options.ham_folder_ids or options.spam_folder_ids):
        progress.error(_("You must specify at least one spam or one good folder"))
        return

    if not rebuild:
        classifier_data = mgr.classifier_data
    else:
        # Train into a separate temporary database; the manager only
        # adopts it further down, once training has run to completion.
        import os
        import manager

        data_dir = mgr.data_directory
        bayes_base = os.path.join(data_dir, "$sbtemp$default_bayes_database")
        mdb_base = os.path.join(data_dir, "$sbtemp$default_message_database")
        storage_class = manager.GetStorageManagerClass()
        classifier_data = manager.ClassifierData(storage_class(bayes_base, mdb_base), mgr)
        classifier_data.InitNew()

    # Scoring, when requested, takes the largest share of the progress bar.
    stages = [(_("Training"), 0.3 if rescore else 0.9), (_("Saving"), 0.1)]
    if rescore:
        stages.append((_("Scoring"), 0.6))
    progress.set_stages(tuple(stages))

    real_trainer(classifier_data, config, mgr.message_store, progress)
    if progress.stop_requested():
        return

    if rebuild:
        assert mgr.classifier_data is not classifier_data
        mgr.AdoptClassifierData(classifier_data)
        classifier_data = mgr.classifier_data
        # Log the statistics for reference before wiping them.
        session_report = "\r\n".join(mgr.stats.GetStats(session_only=True))
        mgr.LogDebug(1, "Session:" + session_report)
        total_report = "\r\n".join(mgr.stats.GetStats())
        mgr.LogDebug(1, "Total:" + total_report)
        mgr.stats.Reset()
        mgr.stats.ResetTotal(permanently=True)

    progress.tick()

    if rescore:
        # Point the "filter now" settings at every training folder.
        now_config = mgr.config.filter_now
        now_config.only_unread = False
        now_config.only_unseen = False
        now_config.action_all = False
        mgr_training = mgr.config.training
        now_config.folder_ids = mgr_training.ham_folder_ids + mgr_training.spam_folder_ids
        now_config.include_sub = mgr_training.ham_include_sub or mgr_training.spam_include_sub
        import filter as filter_module

        filter_module.filterer(mgr, mgr.config, progress)

    bayes = classifier_data.bayes
    summary = _("Completed training with %d spam and %d good messages") % (bayes.nspam, bayes.nham)
    progress.set_status(summary)
Beispiel #5
0
def WizardTrainer(mgr, config, progress):
    """Train a separate wizard database and optionally re-score the folders.

    Creates new classifier data on "$sbwiz$"-prefixed base names inside
    ``mgr.data_directory`` and records the candidate file names (each base
    with ".pck" and ".db" extensions) in
    ``config.wizard.temp_training_names``.  After training, the new data
    is swapped in as ``mgr.classifier_data`` for the tick, the optional
    re-score and the status report.  The original is restored in all
    cases, and the new data is kept on ``mgr.wizard_classifier_data``.
    """
    import os
    import manager
    import train

    data_dir = mgr.data_directory
    bayes_base = os.path.join(data_dir, "$sbwiz$default_bayes_database")
    mdb_base = os.path.join(data_dir, "$sbwiz$default_message_database")
    # For each extension: the bayes name first, then the message db name.
    config.wizard.temp_training_names = [
        base + ext for ext in (".pck", ".db") for base in (bayes_base, mdb_base)
    ]

    # Pick the storage manager class and build the new, empty database.
    storage_class = manager.GetStorageManagerClass()
    classifier_data = manager.ClassifierData(storage_class(bayes_base, mdb_base), mgr)
    classifier_data.InitNew()

    rescore = config.training.rescore
    stages = [(_("Training"), 0.3 if rescore else 0.9), (_("Saving"), 0.1)]
    if rescore:
        stages.append((_("Scoring"), 0.6))
    progress.set_stages(tuple(stages))

    train.real_trainer(classifier_data, config, mgr.message_store, progress)

    # Swap the new data onto the manager; the finally clause undoes this.
    saved_classifier_data = mgr.classifier_data
    mgr.classifier_data = classifier_data
    try:
        progress.tick()
        if rescore:
            # Point the "filter now" settings at every training folder.
            training = config.training
            now_config = config.filter_now
            now_config.only_unread = False
            now_config.only_unseen = False
            now_config.action_all = False
            now_config.folder_ids = training.ham_folder_ids + training.spam_folder_ids
            now_config.include_sub = training.ham_include_sub or training.spam_include_sub
            import filter as filter_module

            filter_module.filterer(mgr, config, progress)
        bayes = classifier_data.bayes
        summary = _("Completed training with %d spam and %d good messages") % (bayes.nspam, bayes.nham)
        progress.set_status(summary)
    finally:
        mgr.wizard_classifier_data = classifier_data
        mgr.classifier_data = saved_classifier_data
Beispiel #6
0
def trainer(mgr, config, progress):
    """Run a training pass over the configured good and/or spam folders.

    The ``rebuild`` and ``rescore`` switches and the folder id lists come
    from ``config.training``.  With no folders configured at all, an error
    is sent to ``progress`` and the function returns without training.

    A rebuild trains into a new temporary database under
    ``mgr.data_directory``.  Provided no stop was requested, the manager
    then adopts it, and the statistics are logged and reset.  A re-score
    overwrites the "filter now" settings on ``mgr.config`` and runs the
    filter over the training folders.

    Always returns None; results are reported via ``progress``.
    """
    train_cfg = config.training
    rebuild = train_cfg.rebuild
    rescore = train_cfg.rescore

    have_folders = train_cfg.ham_folder_ids or train_cfg.spam_folder_ids
    if not have_folders:
        message = _("You must specify at least one spam or one good folder")
        progress.error(message)
        return

    if not rebuild:
        classifier_data = mgr.classifier_data
    else:
        # Train into a new temporary bayes database which the manager
        # "adopts" only if we complete.  That way a cancelled run cannot
        # leave a "bad" db, and mail arriving during training is never
        # classified with a partial database.
        import os
        import manager
        temp_dir = mgr.data_directory
        bayes_base = os.path.join(temp_dir, "$sbtemp$default_bayes_database")
        mdb_base = os.path.join(temp_dir, "$sbtemp$default_message_database")
        # Work out which db manager to use, and create it.
        db_manager = manager.GetStorageManagerClass()(bayes_base, mdb_base)
        classifier_data = manager.ClassifierData(db_manager, mgr)
        classifier_data.InitNew()

    # Possibly three stages: train, save, score.  Re-scoring is much
    # slower than training (each message actually has to be saved back).
    # Saving can be really slow too, but it only gets one tick anyway.
    training_label = _("Training")
    saving_label = _("Saving")
    if rescore:
        stages = ((training_label, .3), (saving_label, .1),
                  (_("Scoring"), .6))
    else:
        stages = ((training_label, .9), (saving_label, .1))
    progress.set_stages(stages)

    real_trainer(classifier_data, config, mgr.message_store, progress)

    if progress.stop_requested():
        return

    if rebuild:
        assert mgr.classifier_data is not classifier_data
        mgr.AdoptClassifierData(classifier_data)
        classifier_data = mgr.classifier_data
        # A rebuild also resets the statistics, after writing them to the
        # log for reference.
        session_lines = mgr.stats.GetStats(session_only=True)
        mgr.LogDebug(1, "Session:" + "\r\n".join(session_lines))
        total_lines = mgr.stats.GetStats()
        mgr.LogDebug(1, "Total:" + "\r\n".join(total_lines))
        mgr.stats.Reset()
        mgr.stats.ResetTotal(permanently=True)

    progress.tick()

    if rescore:
        # Set the "filter now" config up to cover the training folders.
        now_config = mgr.config.filter_now
        now_config.only_unread = False
        now_config.only_unseen = False
        now_config.action_all = False
        mgr_training = mgr.config.training
        now_config.folder_ids = (mgr_training.ham_folder_ids
                                 + mgr_training.spam_folder_ids)
        now_config.include_sub = (mgr_training.ham_include_sub
                                  or mgr_training.spam_include_sub)
        import filter as filter_module
        filter_module.filterer(mgr, mgr.config, progress)

    bayes = classifier_data.bayes
    counts = (bayes.nspam, bayes.nham)
    progress.set_status(
        _("Completed training with %d spam and %d good messages") % counts)