Ejemplo n.º 1
0
# Parallel accumulators: each receives exactly one entry per line of every
# training file, in the same order.
all_strings = []      # line text with both markers stripped
labels_array = []     # 1 when the line carried the signature marker, else 0
feature_array = []    # feature vector computed from the stripped line

# Inline markers present in the labeled corpus. '#sig# ' (note the trailing
# space) flags a signature line; '#reply#' is simply removed
# (presumably it tags quoted-reply lines -- confirm against the dataset).
rep_reply = re.compile('#reply#')
rep_sig = re.compile('#sig# ')

# Read every labeled message and turn each of its lines into one sample.
for filename in os.listdir('traindataset'):
    with open('traindataset/'+filename) as handle:
        lines = handle.readlines()
    sender = extract_sender_name(lines)

    for raw_line in lines:
        # The reply marker goes first so it can never reach the features.
        without_reply = rep_reply.sub('', raw_line)

        # The label must be decided while the signature marker is still there.
        has_sig_marker = rep_sig.search(without_reply) is not None
        labels_array.append(1 if has_sig_marker else 0)

        # Only now drop the signature marker and record the clean text.
        clean_line = rep_sig.sub('', without_reply)
        all_strings.append(clean_line)

        # Feature vector used to build the model.
        feature_array.append(get_features_vector(clean_line, sender))

# Fit a scikit-learn Support Vector Machine (default parameters) on the
# collected samples.
clf = SVC()
clf.fit(feature_array, labels_array)

# Persist the fitted model with maximum compression.
joblib.dump(clf, 'trained_model.pkl', compress=9)
Ejemplo n.º 2
0

__author__ = 'Zakovryashin Alexey'

# Load the classifier that labels a message line as signature (1) or not.
# NOTE: joblib files are pickle-based; only load a model file you trust.
SVM_model = joblib.load('trained_model.pkl')
clf = SVM_model

# Patterns used to pull personal data out of a signature line, in order:
# email, phone, URL, and a name like "Bob W. Smith".
# Raw strings keep the backslash escapes intact for the regex engine.
re_mas = [
    # Email: the whole whitespace-delimited token around '@' (the previous
    # pattern captured only the single characters adjacent to '@').
    re.compile(r'\S+@\S+'),
    # Phone: 3-3-4 digits, "(ddd) ddd-dddd", or a bare 3-4 digit number.
    re.compile(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}'),
    # URL: capture the full address, not just the "http://" prefix.
    re.compile(r'https?://\S+|www\.\S+\.\S+'),
    # Name: capitalised first name, middle initial (optional dot), surname.
    re.compile(r'[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+'),
]

# Every extracted fragment, in the order it was found.
personal_data = []
foldername = raw_input('Input folder with messages: ')

# Process every message file in the chosen folder.
for filename in os.listdir(foldername):
    with open(os.path.join(foldername, filename)) as f:
        lines = f.readlines()
    sender = extract_sender_name(lines)

    for line in lines:
        # predict() works on a batch of samples, so the single feature
        # vector is wrapped in a list and the single answer unwrapped.
        if clf.predict([get_features_vector(line, sender)])[0] != 1:
            continue  # not classified as a signature line

        # Signature line: take the first hit of each pattern. search() is
        # required here because the data may sit anywhere in the line,
        # whereas match() would only look at the very start.
        for regex in re_mas:
            found = regex.search(line)
            if found:
                value = found.group()
                # Parenthesised single-argument print behaves the same as
                # the Python 2 print statement.
                print(value)
                personal_data.append(value)