# NOTE(review): this call is the tail of a definition that begins before the
# visible chunk; it is reproduced unchanged and must keep the indentation of
# that enclosing body.
joblib.dump(DTBody, 'Saved_Model/Body/Decision_Tree_Model_Body')


if __name__ == '__main__':
    # Script entry point: build the dataset, split it into train/test sets and
    # vectorize the e-mail headers.

    # Raise numpy's summarization threshold so arrays are printed in full.
    numpy.set_printoptions(threshold=sys.maxsize)

    # Step 1: create the dataset and take its 'label' column as the target.
    data = Extract_features.Create_the_dataSet()
    y = data['label']

    # Get the important words from the headers.
    # NOTE(review): the return value is discarded, so this is presumably
    # called for a side effect -- confirm in WordsToVector.
    WordsToVector.getTheWordHeader(data)

    # Build a list of email objects; each email holds the header, the body
    # and the label.
    list_emails = Email.List_of_emails(data)

    # Split the dataset: 80% train, 20% test.
    X_train, X_test, y_train, y_test = train_test_split(list_emails, y, train_size=0.8)

    # Collect the headers of the emails, skipping emails whose header is null.
    # NOTE(review): y_train / y_test are not filtered the same way here, so
    # these lists can be shorter than the label vectors -- confirm that the
    # code consuming them accounts for that.
    email_headers_train = [x.header for x in X_train if not pd.isnull(x.header)]
    email_headers_test = [x.header for x in X_test if not pd.isnull(x.header)]

    # Two ways to turn the header text into features: 1) CountVector 2) TfIdf.
    CountTrain, CountTest, vector = WordsToVector.CountVector(email_headers_train, email_headers_test)
    CountTrainTf, CountTestTf, vectorTf = WordsToVector.TfIdf(email_headers_train, email_headers_test)

    # Replace missing labels with the string "0".
    y_train = y_train.fillna("0")
    y_test = y_test.fillna("0")