def test_apply_features():
    """Verify the per-line feature rows that `fs.apply_features` builds."""
    # NOTE(review): the sample text sits on a single line in this copy of the
    # file, yet five rows are expected below -- confirm whether line breaks
    # inside the literal were lost.
    text = """This is John Doe Tuesday @3pm suits. I'll chat to you then. VP Research and Development, Xxxx Xxxx Xxxxx 555-226-2345 [email protected]"""
    sender = "John <*****@*****.**>"

    baseline = fs.apply_features(text, fs.features(sender))
    # The first line is left out because a signature does not usually take up
    # the whole text; empty lines are not considered either.
    eq_(
        baseline,
        [
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ],
    )

    with patch.object(fs, "SIGNATURE_MAX_LINES", 5):
        patched = fs.apply_features(text, fs.features(sender))
        # Same rows as before, because empty lines are not considered.
        eq_(baseline, patched)
def test_apply_features():
    """Check `fs.apply_features` output with and without a patched line cap."""
    sender = 'John <*****@*****.**>'
    # NOTE(review): the message body is a one-line literal in this copy of
    # the file although five feature rows are asserted -- confirm whether the
    # original literal contained line breaks.
    body = '''This is John Doe Tuesday @3pm suits. I'll chat to you then. VP Research and Development, Xxxx Xxxx Xxxxx 555-226-2345 [email protected]'''

    sender_features = fs.features(sender)
    rows = fs.apply_features(body, sender_features)

    # The first line is not considered since signatures rarely span the
    # entire text, and empty lines are not considered.
    expected = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    eq_(rows, expected)

    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
        sender_features = fs.features(sender)
        rows_with_patch = fs.apply_features(body, sender_features)
        # Unchanged result: empty lines are not considered.
        eq_(rows, rows_with_patch)
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Build the signature extraction dataset from the emails in `folder`.

    The emails in `folder` should be annotated, i.e. signature lines should
    be marked with `#sig#`.

    For every file for which `parse_msg_sender` yields both a sender and a
    message, at most `SIGNATURE_MAX_LINES` trailing lines are visited from
    the bottom up.  A leading annotation is stripped from each line, the line
    is turned into a pattern with `build_pattern`, and the pattern is written
    to `dataset_filename` as one comma-separated row whose last column is the
    label: 1 for lines that carried the signature annotation, -1 otherwise.

    Any existing content of `dataset_filename` is discarded.
    """
    # Mode 'w' truncates a previous dataset, so no separate exists/remove
    # step is needed before writing.
    with open(dataset_filename, 'w') as dataset:
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(path, sender_known)
            if not sender or not msg:
                continue

            # features() is given only the sender, so it is called once per
            # message instead of once per line.
            sender_features = features(sender)

            # Reverse first, then cap: last line first, at most
            # SIGNATURE_MAX_LINES of them.
            for line in msg.splitlines()[::-1][:SIGNATURE_MAX_LINES]:
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply lines keep the negative label; only the marker
                    # is removed.
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join(str(e) for e in X) + '\n')
def test_build_pattern():
    """Check the single pattern `fs.build_pattern` produces for a signature."""
    signature_text = '''John Doe VP Research and Development, Xxxx Xxxx Xxxxx 555-226-2345 [email protected]'''
    sender = 'John <*****@*****.**>'

    pattern = fs.build_pattern(signature_text, fs.features(sender))

    expected = [2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]
    eq_(pattern, expected)
def test_build_extraction_dataset():
    """Build the extraction dataset and check it loads with the right size."""
    dataset_path = os.path.join(TMP_DIR, 'extraction.data')

    # Start from a clean slate so an earlier run cannot influence the result.
    if os.path.exists(dataset_path):
        os.remove(dataset_path)

    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), dataset_path, 1)

    # The result is a loadable signature extraction dataset.
    loaded = SparseDataSet(dataset_path, labelsColumn=-1)

    # 32 comes from 3 emails in the emails/P folder with 11 lines checked to
    # be a signature, where one email has only 10 lines.
    eq_(loaded.size(), 32)
    eq_(len(features('')), loaded.numFeatures)
def test_apply_features():
    """Check the feature rows `fs.apply_features` builds for a signature."""
    # NOTE(review): the sample text is a one-line literal in this copy of the
    # file while four rows are expected below -- confirm whether line breaks
    # inside the literal were lost.
    signature_text = '''John Doe VP Research and Development, Xxxx Xxxx Xxxxx 555-226-2345 [email protected]'''
    sender = 'John <*****@*****.**>'

    expected_rows = [
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]

    default_rows = fs.apply_features(signature_text, fs.features(sender))
    # The first line is not considered because signatures do not usually
    # take all the text; empty lines are not considered.
    eq_(default_rows, expected_rows)

    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
        capped_rows = fs.apply_features(signature_text, fs.features(sender))
        # The rows stay the same because empty lines are not considered.
        eq_(default_rows, capped_rows)
def test_build_extraction_dataset():
    """Build the extraction dataset and check the shape of its feature part."""
    dataset_path = os.path.join(TMP_DIR, "extraction.data")
    emails_path = os.path.join(EMAILS_DIR, "P")

    # Remove leftovers from a previous run before rebuilding the dataset.
    if os.path.exists(dataset_path):
        os.remove(dataset_path)
    d.build_extraction_dataset(emails_path, dataset_path, 1)

    # The result is a loadable signature extraction dataset; the last column
    # is dropped to leave only the feature columns.
    table = genfromtxt(dataset_path, delimiter=",")
    feature_table = table[:, :-1]
    row_count, column_count = feature_table.shape[0], feature_table.shape[1]

    # 32 comes from 3 emails in the emails/P folder with 11 lines checked to
    # be a signature, where one email has only 10 lines.
    eq_(row_count, 32)
    eq_(len(features("")), column_count)
def test_build_extraction_dataset():
    """Rebuild the extraction dataset and verify its row and column counts."""
    filename = os.path.join(TMP_DIR, 'extraction.data')

    # Make sure a stale dataset from an earlier run is gone.
    if os.path.exists(filename):
        os.remove(filename)

    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), filename, 1)

    # The result is a loadable signature extraction dataset.  Every column
    # except the last one is kept for the checks below.
    features_only = genfromtxt(filename, delimiter=",")[:, :-1]

    # 32 comes from 3 emails in the emails/P folder, 11 lines checked to be
    # a signature, and one email that has only 10 lines.
    eq_(features_only.shape[0], 32)
    eq_(len(features('')), features_only.shape[1])
def build_extraction_dataset(repetition, source_folder, emails,
                             dataset_filename, sender_known=True):
    """Build a signature extraction dataset from the listed `emails`.

    The emails should be annotated, i.e. signature lines should be marked
    with `#sig#`.

    The dataset is written to `dataset_filename + repetition`; any existing
    content of that file is discarded.  Each name in `emails` is appended to
    `source_folder` by plain string concatenation (no path separator is
    inserted) and parsed with `parse_msg_sender`.  For every message with
    both a sender and a body, at most `SIGNATURE_MAX_LINES` trailing lines
    are visited from the bottom up, stripped of a leading annotation, turned
    into a pattern with `build_pattern` and written as one comma-separated
    row whose last column is the label: 1 for lines that carried the
    signature annotation, -1 otherwise.

    Returns the name of the dataset file that was written.
    """
    dataset_filename = dataset_filename + repetition

    # Mode 'w' truncates a previous dataset, so no separate exists/remove
    # step is needed before writing.
    with open(dataset_filename, 'w') as dataset:
        for email in emails:
            filename = source_folder + email
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            # Optional hook (disabled): save the marked signature part into
            # a '*_result' file.  To enable it, uncomment the lines below,
            # indent the line loop under the `with`, and uncomment the
            # `result.write` call further down.
            # NOTE(review): this copy of the file lost its line breaks, so it
            # is unclear whether these statements were commented out; they
            # are kept disabled here because the matching `result.write`
            # call is commented out -- confirm.
            # result_filename = build_result_filename(filename)
            # if os.path.exists(result_filename):
            #     os.remove(result_filename)
            # with open(result_filename, 'a') as result:

            # features() is given only the sender, so it is called once per
            # message instead of once per line.
            sender_features = features(sender)

            # Reverse first, then cap: last line first, at most
            # SIGNATURE_MAX_LINES of them.
            for line in msg.splitlines()[::-1][:SIGNATURE_MAX_LINES]:
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    # result.write(line + '\n')
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply lines keep the negative label; only the marker
                    # is removed.
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join(str(e) for e in X) + '\n')

    return dataset_filename
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Build the signature extraction dataset from the emails in `folder`.

    The emails in `folder` should be annotated, i.e. signature lines should
    be marked with `#sig#`.

    For every file for which `parse_msg_sender` yields both a sender and a
    message, at most `SIGNATURE_MAX_LINES` trailing lines are visited from
    the bottom up.  A leading annotation is stripped from each line, the line
    is turned into a pattern with `build_pattern`, and the pattern is written
    to `dataset_filename` as one comma-separated row whose last column is the
    label: 1 for lines that carried the signature annotation, -1 otherwise.

    Any existing content of `dataset_filename` is discarded.
    """
    # Mode 'w' truncates a previous dataset, so no separate exists/remove
    # step is needed before writing.
    with open(dataset_filename, 'w') as dataset:
        for name in os.listdir(folder):
            filename = os.path.join(folder, name)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            # Optional hook (disabled): use the 2 lines below to pre-process
            # emails to get the body and sender file for later Email
            # Extraction.
            # msg = process(msg, filename, sender)
            # continue

            # Optional hook (disabled): save the marked signature part into
            # a '*_result' file.  To enable it, uncomment the lines below,
            # indent the line loop under the `with`, and uncomment the
            # `result.write` call further down.
            # result_filename = build_result_filename(filename)
            # if os.path.exists(result_filename):
            #     os.remove(result_filename)
            # with open(result_filename, 'a') as result:

            # features() is given only the sender, so it is called once per
            # message instead of once per line.
            sender_features = features(sender)

            # Reverse first, then cap: last line first, at most
            # SIGNATURE_MAX_LINES of them.  A plain slice replaces the
            # xrange-based index loop, which is unavailable on Python 3.
            for line in msg.splitlines()[::-1][:SIGNATURE_MAX_LINES]:
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    # result.write(line + '\n')
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply lines keep the negative label; only the marker
                    # is removed.
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join(str(e) for e in X) + '\n')
def extract_training_vectors(emails, csv_file, output_extraction_file):
    """Write labeled line patterns for the selected emails of a CSV file.

    `csv_file` is read with `csv.DictReader`; the columns used are
    'filename', 'sender', 'origin' and 'sig'.  Rows whose 'filename' is not
    in `emails` are skipped.  For each remaining row, every distinct line of
    'origin' is turned into a pattern with `build_pattern` and written to
    `output_extraction_file` as one comma-separated row whose last column is
    the label: 1 if the line also occurs among the lines of 'sig', else 0.

    Any existing content of `output_extraction_file` is discarded.  Lines are
    de-duplicated through a set, so the order of the rows written for one
    email follows set iteration order rather than the order in the message.
    """
    with open(output_extraction_file, 'w') as dataset, \
            open(csv_file, 'r') as csvinput:
        for row in csv.DictReader(csvinput):
            if row['filename'] not in emails:
                continue

            # features() is given only the sender, so it is called once per
            # email instead of once per line.
            sender_features = features(row['sender'])
            lines = set(row['origin'].splitlines())
            sigs = set(row['sig'].splitlines())

            for line in lines:
                label = 1 if line in sigs else 0
                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join(str(e) for e in X) + '\n')
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Append one class of the signature detection dataset to a file.

    The signature detection dataset includes patterns for two classes:

    * the class of positive patterns (goes with label 1)
    * the class of negative patterns (goes with label -1)

    One pattern is built per email in `folder` from the whole message with
    the annotation markers removed, and appended to `dataset_filename` as a
    comma-separated row that ends with `label`.  Files for which
    `parse_msg_sender` returns no sender or no message are skipped.

    Example::

        build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue

            # Strip every annotation marker before building the pattern.
            annotation_regex = '|'.join(ANNOTATIONS)
            cleaned_msg = re.sub(annotation_regex, '', msg)

            row = build_pattern(cleaned_msg, features(sender))
            row.append(label)
            dataset.write(','.join(str(value) for value in row) + '\n')
def is_signature_line(line, sender, classifier):
    '''Tell whether `line` belongs to the signature.

    The line is turned into a pattern with the features built for `sender`,
    wrapped in a one-pattern `SparseDataSet`, and scored with
    `classifier.decisionFunc`; the result is whether that score is
    greater than zero.
    '''
    pattern = build_pattern(line, features(sender))
    dataset = SparseDataSet([pattern])
    score = classifier.decisionFunc(dataset, 0)
    return score > 0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    The line is turned into a pattern with the features built for `sender`
    and passed to `classifier.predict` as a single-row 2-D array.  The
    result is the element-wise comparison of the prediction with zero, so
    for an array-returning `predict` it is a one-element boolean array that
    is truthy when the line is classified as part of the signature.

    NOTE(review): assumes `classifier` follows the scikit-learn estimator
    convention, where `predict` takes input of shape
    (n_samples, n_features) -- confirm against the caller.
    '''
    pattern = build_pattern(line, features(sender))
    # One sample with len(pattern) features: a flat 1-D vector is rejected
    # by estimators that require 2-D input.
    data = numpy.array(pattern).reshape(1, -1)
    return classifier.predict(data) > 0