Example #1
0
        joblib.dump(DTBody, 'Saved_Model/Body/Decision_Tree_Model_Body')


# Script entry point: build the dataset, split it into train/test parts and
# vectorize the email headers with two different text representations.
if __name__ == '__main__':
    # Print numpy arrays in full instead of summarizing long ones with "...".
    numpy.set_printoptions(threshold=sys.maxsize)
    # The first part is to create the dataset.
    data = Extract_features.Create_the_dataSet()

    # Target labels, taken from the 'label' column of the dataset.
    y = data['label']

    # Get the important words from the headers.
    # NOTE(review): the return value is discarded, so this call presumably
    # works through side effects (on `data` or elsewhere) -- confirm.
    WordsToVector.getTheWordHeader(data)

    # Create a list of email objects;
    # each email contains the header, the body and the label.
    list_emails = Email.List_of_emails(data)

    # Split the dataset: 80% train and 20% test.
    # No random_state is passed; assuming this is scikit-learn's
    # train_test_split, the split will differ from run to run.
    X_train, X_test, y_train, y_test = train_test_split(list_emails, y, train_size=0.8)

    # Collect the headers of the emails, skipping emails whose header is null.
    # NOTE(review): this filter can make the header lists shorter than
    # X_train / X_test, while y_train / y_test are not filtered here --
    # confirm that features and labels are still aligned wherever they are
    # paired later on.
    email_headers_train = [x.header for x in X_train if not pd.isnull(x.header)]
    email_headers_test = [x.header for x in X_test if not pd.isnull(x.header)]

    # Two ways to turn the text into vectors: 1) CountVector 2) TfIdf.
    # Each call takes the train and test headers and returns three values,
    # unpacked here as (train result, test result, vectorizer).
    CountTrain, CountTest, vector = WordsToVector.CountVector(email_headers_train, email_headers_test)
    CountTrainTf, CountTestTf, vectorTf = WordsToVector.TfIdf(email_headers_train, email_headers_test)

    # Replace missing labels with the string "0".
    y_train = y_train.fillna("0")
    y_test = y_test.fillna("0")