# Training script: builds one label and one feature vector per line of every
# labelled message, fits a classifier on them and saves the fitted model.

# Per-line accumulators: cleaned text, 0/1 labels, feature vectors.
all_strings = []
labels_array = []
feature_array = []

# Markers embedded in the labelled messages.
rep_reply = re.compile('#reply#')
rep_sig = re.compile('#sig# ')

# Read every file of the labelled data set.
for filename in os.listdir('traindataset'):
    with open('traindataset/' + filename) as handle:
        lines = handle.readlines()

    sender = extract_sender_name(lines)

    for raw_line in lines:
        # Strip the reply marker first; the signature marker is still needed
        # to decide the label.
        unmarked = rep_reply.sub('', raw_line)

        # Label is 1 when the signature marker is present, otherwise 0.
        labels_array.append(1 if rep_sig.search(unmarked) else 0)

        # Features are computed on the text with both markers removed.
        cleaned = rep_sig.sub('', unmarked)
        all_strings.append(cleaned)
        feature_array.append(get_features_vector(cleaned, sender))

# Fit a scikit-learn support vector classifier on the collected samples.
clf = SVC()
clf.fit(feature_array, labels_array)

# Persist the fitted model (maximum compression level).
joblib.dump(clf, 'trained_model.pkl', compress=9)
__author__ = 'Zakovryashin Alexey'

# Extraction script: classifies every line of every message in a user-chosen
# folder and, for lines classified as signature, prints and collects the
# personal data found by the regular expressions below.

# Load the previously fitted classifier.
# NOTE(review): joblib.load deserializes arbitrary Python objects (pickle
# based), so only load model files from a trusted source.
SVM_model = joblib.load('trained_model.pkl')
clf = SVM_model

# Regular expressions for email, phone, URL, and name like Bob W. Smith.
# The email and URL patterns consume the whole whitespace-delimited token so
# that the full address is captured rather than a few characters of it.
re_mas = [
    re.compile(r'\S+@\S+'),
    re.compile(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}'),
    re.compile(r'https?://\S+|www\.\S+\.\S+'),
    re.compile(r'[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+'),
]

# Every extracted match, in the order it was found.
personal_data = []
foldername = raw_input('Input folder with messages: ')

# Open all message files from the folder.
for filename in os.listdir(foldername):
    with open(os.path.join(foldername, filename)) as f:
        lines = f.readlines()

    sender = extract_sender_name(lines)

    for l in lines:
        # predict() takes a 2-D batch of samples, so the single feature vector
        # is wrapped in a list; [0] picks the label of that one sample.
        if clf.predict([get_features_vector(l, sender)])[0] != 1:
            continue

        # The line was classified as signature: try every pattern on it.
        for regex in re_mas:
            # search() scans the whole line; match() would only succeed when
            # the data sits at the very first character of the line.
            found = regex.search(l)
            if found:
                value = found.group()
                # Parenthesized single-argument print behaves the same as the
                # Python 2 print statement.
                print(value)
                personal_data.append(value)