def train_dir(directory, cls, bool_model, bool_feat, ntf_model, ntf_feat):
    """Train two models on every file found in ``directory``.

    For each path returned by ``NBmodel.get_files(directory)`` an example is
    built with ``NBmodel.munge_Boolean`` (using ``bool_feat``) and another
    with ``NBmodel.munge_NTF`` (using ``ntf_feat``).  The first is passed to
    ``bool_model.train`` and the second to ``ntf_model.train``, both with the
    label ``cls``.

    Nothing is returned; the models are updated through their ``train``
    methods.
    """
    models = (bool_model, ntf_model)
    for path in NBmodel.get_files(directory):
        # Both examples are built before either model is trained, in the
        # order Boolean -> NTF, and training then follows the same order.
        examples = (
            NBmodel.munge_Boolean(path, bool_feat),
            NBmodel.munge_NTF(path, ntf_feat),
        )
        for model, example in zip(models, examples):
            model.train(example, cls)
def _get_features_list(spam_dir, ham_dir):
    """Collect per-class token statistics from the spam and ham directories.

    Every file returned by ``NBmodel.get_files(spam_dir)`` is tallied under
    the label ``"SPAM"`` and every file returned for ``ham_dir`` under
    ``"HAM"``.  Each line has its trailing newline / carriage return and its
    surrounding spaces removed and is then split on single spaces, so runs of
    spaces yield empty-string tokens that are counted like any other token.

    Returns:
        A ``defaultdict`` keyed by ``(label, token)``.  Each value is a
        two-element list ``[count, weighted]`` where ``count`` is the number
        of occurrences seen and ``weighted`` is the running sum of
        ``1 / (number of tokens on the line)`` over those occurrences.
        Missing keys default to ``[0, 0]``.
    """
    def process_email(freq, filename, cls):
        """Add the tokens of one file to ``freq`` under the label ``cls``."""
        # ``with`` guarantees the handle is closed even if reading raises.
        with open(filename, "r") as email:
            for line in email:
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # split(" ") always returns at least one element, so the
                # division is safe; 1.0 forces true division on any Python.
                weight = 1.0 / len(tok)
                for t in tok:
                    entry = freq[(cls, t)]
                    entry[0] += 1
                    # Accumulate onto the previous weighted total (slot 1),
                    # not onto the raw count in slot 0.
                    entry[1] += weight

    freq = defaultdict(lambda: [0, 0])
    # Spam is processed before ham, matching the original traversal order.
    for directory, label in ((spam_dir, "SPAM"), (ham_dir, "HAM")):
        for email_file in NBmodel.get_files(directory):
            process_email(freq, email_file, label)
    return freq
def train_dir(directory, cls, model):
    """Train ``model`` on every file in ``directory`` with the label ``cls``.

    Each path returned by ``NBmodel.get_files(directory)`` is converted with
    ``model.munge`` and buffered.  Whenever the buffer grows past 500
    examples it is fed to ``model.train`` one example at a time and emptied.
    After the last file the remaining buffered examples are trained as well,
    so a trailing partial batch (including the whole input when it holds 500
    files or fewer) is no longer silently dropped.

    Nothing is returned; the model is updated through ``model.train``.
    """
    batch_limit = 500
    pending = []
    for path in NBmodel.get_files(directory):
        pending.append(model.munge(path))
        if len(pending) > batch_limit:
            for example in pending:
                model.train(example, cls)
            pending = []
    # Flush whatever is left over from the final, partially filled batch.
    for example in pending:
        model.train(example, cls)
def train_dir(directory, cls, model):
    """Train ``model`` on every file in ``directory`` with the label ``cls``.

    Each path returned by ``NBmodel.get_files(directory)`` is converted with
    ``model.munge`` and buffered.  Whenever the buffer grows past 500
    examples it is fed to ``model.train`` one example at a time and emptied.
    After the last file the remaining buffered examples are trained as well,
    so a trailing partial batch (including the whole input when it holds 500
    files or fewer) is no longer silently dropped.

    Nothing is returned; the model is updated through ``model.train``.
    """
    batch_limit = 500
    pending = []
    for path in NBmodel.get_files(directory):
        pending.append(model.munge(path))
        if len(pending) > batch_limit:
            for example in pending:
                model.train(example, cls)
            pending = []
    # Flush whatever is left over from the final, partially filled batch.
    for example in pending:
        model.train(example, cls)
def _get_features_list(spam_dir, ham_dir):
    """Collect per-class token statistics from the spam and ham directories.

    Every file returned by ``NBmodel.get_files(spam_dir)`` is tallied under
    the label ``'SPAM'`` and every file returned for ``ham_dir`` under
    ``'HAM'``.  Each line has its trailing newline / carriage return and its
    surrounding spaces removed and is then split on single spaces, so runs of
    spaces yield empty-string tokens that are counted like any other token.

    Returns:
        A ``defaultdict`` keyed by ``(label, token)``.  Each value is a
        two-element list ``[count, weighted]`` where ``count`` is the number
        of occurrences seen and ``weighted`` is the running sum of
        ``1 / (number of tokens on the line)`` over those occurrences.
        Missing keys default to ``[0, 0]``.
    """
    def process_email(freq, filename, cls):
        """Add the tokens of one file to ``freq`` under the label ``cls``."""
        # ``with`` guarantees the handle is closed even if reading raises.
        with open(filename, 'r') as email:
            for line in email:
                tok = line.rstrip("\n").rstrip("\r").strip(" ").split(" ")
                # split(" ") always returns at least one element, so the
                # division is safe; 1.0 forces true division on any Python.
                weight = 1.0 / len(tok)
                for t in tok:
                    entry = freq[(cls, t)]
                    entry[0] += 1
                    # Accumulate onto the previous weighted total (slot 1),
                    # not onto the raw count in slot 0.
                    entry[1] += weight

    freq = defaultdict(lambda: [0, 0])
    # Spam is processed before ham, matching the original traversal order.
    for directory, label in ((spam_dir, 'SPAM'), (ham_dir, 'HAM')):
        for email_file in NBmodel.get_files(directory):
            process_email(freq, email_file, label)
    return freq