def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build labelled training samples from the text of a single email.

    The text is cleaned with ``fix_bad_unicode`` and ``strip_signature``,
    split by ``extract_bigrams``, and every resulting item is converted with
    ``get_feature`` and paired with ``category``.

    Args:
        text: Raw email text.
        category: Label attached to every sample produced.
        sender: Optional sender; forwarded to ``strip_signature`` and, when
            given, added as an extra ``({sender: True}, category)`` sample.
        subject: Optional subject; when given, added as an extra
            ``({subject: True}, category)`` sample.

    Returns:
        A list of ``(features, category)`` tuples.
    """
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)

    training_set = [(get_feature(word), category) for word in extract_bigrams(text)]

    # Omitted metadata must not turn into a feature: with the defaults the
    # old code appended two identical ``({None: True}, category)`` samples.
    for value in (sender, subject):
        if value is not None:
            training_set.append(({value: True}, category))

    return training_set
def build_training_set(path='../email_dataset'):
    """Build the training set from every email file found in ``path``.

    Each file is read as UTF-8. Its first four lines are, in order, the
    integer category, the sender, the receiver and the subject; the rest of
    the file is the email text. The samples for each email are produced by
    ``build_training_set_from_text``.

    Args:
        path: Directory containing the email files.

    Returns:
        A list of ``(features, category)`` tuples covering all files.

    Raises:
        ValueError: If the first line of a file is not an integer.
    """
    # NOTE(review): this file contains a later definition of
    # build_training_set with the same logic, which replaces this one.
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            print(u'Parsing file: {}'.format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            # The receiver line is not used, but it must still be consumed so
            # that the next line read is the subject.
            email.readline()
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            # Extend in place: rebuilding the list with ``+`` for every file
            # copies all previously collected samples each time.
            training_set.extend(
                build_training_set_from_text(email.read(), category, sender, subject)
            )
    return training_set
def build_training_set(path="../email_dataset"):
    """Build the training set from every email file found in ``path``.

    Each file is read as UTF-8. Its first four lines are, in order, the
    integer category, the sender, the receiver and the subject; the rest of
    the file is the email text. The samples for each email are produced by
    ``build_training_set_from_text``.

    Args:
        path: Directory containing the email files.

    Returns:
        A list of ``(features, category)`` tuples covering all files.

    Raises:
        ValueError: If the first line of a file is not an integer.
    """
    # NOTE(review): this redefines build_training_set; an earlier definition
    # with the same logic exists in this file and is shadowed by this one.
    training_set = []
    for email_file in listdir(path):
        with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email:
            print(u"Parsing file: {}".format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            # The receiver line is not used, but it must still be consumed so
            # that the next line read is the subject.
            email.readline()
            subject = email.readline().strip()
            print(u"Training set updated with: [{}]".format(subject))
            # Extend in place: rebuilding the list with ``+`` for every file
            # copies all previously collected samples each time.
            training_set.extend(
                build_training_set_from_text(email.read(), category, sender, subject)
            )
    return training_set