Example #1
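The snippets on this page omit their imports. A plausible header for all five examples, assuming pandas, NumPy, SciPy, scikit-learn, and the Keras scikit-learn wrapper (readJson_config and writeJson_config are project-local JSON helpers, not library functions):

import ast
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
# older Keras releases; newer projects use the scikeras package instead
from keras.wrappers.scikit_learn import KerasClassifier
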
def featureEngineeringTraining():
    # ----------------- READ PREPROCESSING FILE -----------------------
    DATA_FOLDER = "ml_core/data/training/"
    VECT_FOLDER = "ml_core/vector/training/"
    FEATURES_FOLDER = "ml_core/vector/training/features/"

    FileName = "Preprocessed_Dataset_Training.csv"

    TWEET_DATA = pd.read_csv(DATA_FOLDER + FileName,
                             usecols=["tweet_tokens_stemmed"])
    TWEET_DATA.columns = ["tweet"]

    # join the list of tokens into a single document string
    def join_text_list(texts):
        tokens = ast.literal_eval(texts)  # the CSV stores each token list as its string repr
        return ' '.join(tokens)

    TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)

    #------------------------- READ CONFIG ---------------------------
    ses_max_feature = readJson_config('ml_core/', 'configuration.json',
                                      'max_features')
    max_features = int(
        ses_max_feature[0]) if ses_max_feature is not None else 1000

    # ------------------------- MAIN CALC ----------------------------
    # unigrams only; set ngram_range=(1, 3) to also use bigrams and trigrams
    cvect = CountVectorizer(max_features=max_features, ngram_range=(1, 1))
    counts = cvect.fit_transform(TWEET_DATA["tweet_join"])

    normalized_counts = normalize(counts, norm='l1', axis=1)  # L1 row normalization = term frequencies

    tfidf = TfidfVectorizer(max_features=max_features,
                            ngram_range=(1, 1),
                            smooth_idf=False)
    tfidf.fit(TWEET_DATA["tweet_join"])  # fitted only to obtain idf_
    # manual TF-IDF: term frequencies scaled column-wise by idf
    tfidf_sparse = normalized_counts.multiply(tfidf.idf_)

    feature_name = {}
    feature_name['feature'] = tfidf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn 1.0

    #------------------------ SAVE -----------------------------------
    tfidf_sparse = sparse.csr_matrix(tfidf_sparse)

    sparse.save_npz(VECT_FOLDER + "tfidf_sparse_training.npz", tfidf_sparse)
    writeJson_config(FEATURES_FOLDER, ("tfidf_feature_training.json"),
                     feature_name,
                     append=False)

    return 'success'
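
The TF-IDF matrix above is assembled manually: raw counts are L1-normalized per document (term frequency), then scaled column-wise by the idf_ vector of a separately fitted TfidfVectorizer. A minimal sketch of the same computation on a made-up two-document corpus:

# toy corpus (hypothetical); both vectorizers order the vocabulary
# alphabetically, so their columns line up
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

docs = ["rain today", "sun today today"]
tf = normalize(CountVectorizer().fit_transform(docs), norm='l1', axis=1)
idf = TfidfVectorizer(smooth_idf=False).fit(docs).idf_
manual_tfidf = tf.multiply(idf)  # entry (i, j) = tf_ij * idf_j
print(manual_tfidf.toarray().round(3))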
Example #2
def classificationTraining(cnn_model, tfidf_mat_selection, tags):
    MODEL_FOLDER = "ml_core/model/"

    # split dataset into 75% train / 25% test
    tfidf_mat_train, tfidf_mat_test, tags_train, tags_test = \
            train_test_split(tfidf_mat_selection, tags, test_size=0.25, random_state=42)

    def check_model(model, x, y):
        # epochs and batch_size come from the KerasClassifier constructor below
        return model.fit(x, y, verbose=1, validation_split=0.2)

    # ----------------------------- START TRAINING MODEL ------------------------------
    estimator = KerasClassifier(build_fn=cnn_model, epochs=25, batch_size=6)

    history = check_model(estimator, tfidf_mat_train, tags_train.ravel())

    # ------------------------------ TEST REPORT --------------------------------
    tags_pred = estimator.predict(tfidf_mat_test)

    def print_cm(y_true, y_pred, labels_order):
        # rows are the true labels, columns the predictions
        return pd.DataFrame(
                    confusion_matrix(y_true, y_pred, labels=labels_order),
                    index=['target : 1', 'target : 0'],
                    columns=['pred : 1', 'pred : 0']
                )

    # sklearn expects y_true first, then y_pred
    cm_model = print_cm(tags_test, tags_pred, [1, 0])
    report_model = classification_report(tags_test, tags_pred, output_dict=True)

    print(cm_model)
    print(report_model)
    # ------------------------------ SAVE MODEL & HISTORY ------------------------------    
    
    estimator.model.save(MODEL_FOLDER + "cnn_model_training.h5")

    pickle.dump(estimator.classes_, open(MODEL_FOLDER + 'cnn_class_training.pkl','wb'))
    # class_json = {}
    # class_json['class'] = estimator.classes_
    # writeJson_config(MODEL_FOLDER , 'cnn_class_training.json', class_json, append=False)

    def formatStr(floats):
        return ['{:.2f}'.format(x) for x in floats]

    # keys below must match the metric names used when compiling the model
    json_hist = {}
    json_hist["acc"] = formatStr(history.history['acc'])
    json_hist["val_acc"] = formatStr(history.history['val_acc'])
    json_hist["prec"] = formatStr(history.history['prec'])
    json_hist["val_prec"] = formatStr(history.history['val_prec'])
    json_hist["rec"] = formatStr(history.history['rec'])
    json_hist["val_rec"] = formatStr(history.history['val_rec'])
    json_hist["loss"] = formatStr(history.history['loss'])
    json_hist["val_loss"] = formatStr(history.history['val_loss'])
    writeJson_config(MODEL_FOLDER + "history/" , "cnn_history_model.json", json_hist, append=False)

    json_report = {}
    json_report['confusion_matrix'] = cm_model.values.tolist()
    json_report['report'] = report_model
    writeJson_config(MODEL_FOLDER + "report/" , "cnn_report_model.json", json_report, append=False)

    return 'success'
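
classificationTraining receives its model builder from the caller, and the history keys saved above ('acc', 'prec', 'rec') imply the network is compiled with metrics under exactly those names. A hypothetical builder that would satisfy the wrapper; the layer sizes and the n_features default are assumptions, and a plain dense network is sketched here despite the cnn_ prefix:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import Precision, Recall

def cnn_model(n_features=500):  # hypothetical input width
    model = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,)),
        Dense(1, activation='sigmoid'),
    ])
    # metric names must match the history keys written by classificationTraining
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['acc', Precision(name='prec'), Recall(name='rec')])
    return model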
Example #3
def featureSelection():

    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/"
    VECT_FOLDER = "ml_core/vector/"

    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"

    FileName = []

    for filename in os.listdir(VECT_FOLDER):
        path = os.path.join(VECT_FOLDER, filename)
        if not os.path.isdir(path):
            strDatetime = filename.replace("tfidf_sparse_",
                                           "").replace(".npz", "")
            FileDatetime = datetime.strptime(strDatetime, "%d%m%Y_%H%M%S")
            FileName.append([filename, FileDatetime])

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    FileName = sorted(FileName, key=lambda t: t[1], reverse=True)  # newest file first
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName[0][0]).toarray()

    json_name = (FileName[0][0]).replace(".npz",
                                         ".json").replace("sparse", "feature")
    features = readJson_config(VECT_FOLDER + "features/", json_name,
                               'feature')[0]

    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")

    # ------------------------------- RUN -------------------------------
    # load the template row once and give every document its own copy
    template_row = sparse.load_npz(VECT_TEMPLATE).toarray()[0]
    tfidf_mat_template = [template_row.copy() for _ in range(len(tfidf_mat))]

    # [0] unwraps the value list returned by readJson_config, as elsewhere
    features_template = readJson_config(TEMPLATE_FOLDER, FEATURE_CONFIG,
                                        'feature')[0]
    print(features_template)

    # template membership is the same for every document, so compute the
    # selected/unselected flags once
    selected_idx = [1 if feature in features_template else 0
                    for feature in features]

    for i in range(len(tfidf_mat)):
        for idx, feature in enumerate(features):
            if selected_idx[idx]:
                idx_template = features_template.index(feature)
                tfidf_mat_template[i][idx_template] = tfidf_mat[i][idx]

    #-------------------------------- SAVE -----------------------------------
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(
        VECT_SEL_FOLDER + "tfidf_selection_sparse_" + dt_string + ".npz",
        tfidf_sparse_template)
    writeJson_config(VECT_SEL_FOLDER + "features/",
                     ("tfidf_sparse_" + dt_string + ".json"),
                     {'feature': features_template},  # keyed 'feature' like the other feature files
                     append=False)

    # ----------------------------- LOAD DATA VIEW ----------------------------
    tableRecords = []
    for i in range(len(tfidf_mat)):
        for feature, weight, selected in zip(features, tfidf_mat[i], selected_idx):
            if weight != 0.0:
                tableRecords.append(
                    ['Document_' + str(i), feature, round(float(weight), 3), selected])

    return tableRecords
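
The core of featureSelection is the alignment step: each document's weights are copied into the fixed column order of the training-time template, and template features absent from the current vocabulary stay at zero. A toy illustration with made-up feature names and weights:

features = ['rain', 'sun', 'wind']             # current vocabulary (hypothetical)
features_template = ['cloud', 'rain', 'wind']  # training-time template (hypothetical)
row = [0.4, 0.1, 0.5]                          # one document's TF-IDF weights

template_row = [0.0] * len(features_template)
for j, f in enumerate(features):
    if f in features_template:
        template_row[features_template.index(f)] = row[j]
print(template_row)  # [0.0, 0.4, 0.5] -- 'sun' is dropped, 'cloud' stays zero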
Example #4
def featureEngineering():
    # ----------------- READ PREPROCESSING FILE -----------------------
    DATA_FOLDER = "ml_core/data/"
    VECT_FOLDER = "ml_core/vector/"
    FEATURES_FOLDER = "ml_core/vector/features/"

    FileName = []
    for filename in os.listdir(DATA_FOLDER):
        path = os.path.join(DATA_FOLDER, filename)
        if not os.path.isdir(path):
            strDatetime = filename.replace("Preprocessed_Dataset_", "").replace(".csv", "")
            FileDatetime = datetime.strptime(strDatetime, "%d%m%Y_%H%M%S")
            FileName.append([filename, FileDatetime])

    FileName = sorted(FileName, key=lambda t: t[1], reverse=True)  # newest file first

    TWEET_DATA = pd.read_csv(DATA_FOLDER + FileName[0][0], usecols=["tweet_tokens_stemmed"])
    TWEET_DATA.columns = ["tweet"]

    # join the list of tokens into a single document string
    def join_text_list(texts):
        tokens = ast.literal_eval(texts)  # the CSV stores each token list as its string repr
        return ' '.join(tokens)

    TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)


    #------------------------- READ CONFIG ---------------------------
    ses_max_feature = readJson_config('ml_core/', 'configuration.json', 'max_features')
    max_features = int(ses_max_feature[0]) if ses_max_feature is not None else 1000

    # ------------------------- MAIN CALC ----------------------------
    # unigrams only; set ngram_range=(1, 3) to also use bigrams and trigrams
    cvect = CountVectorizer(max_features=max_features, ngram_range=(1,1))
    counts = cvect.fit_transform(TWEET_DATA["tweet_join"])

    normalized_counts = normalize(counts, norm='l1', axis=1)  # L1 row normalization = term frequencies

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1, 1), smooth_idf=False)
    tfidf.fit(TWEET_DATA["tweet_join"])  # fitted only to obtain idf_
    tfidf_sparse = normalized_counts.multiply(tfidf.idf_)  # manual TF-IDF: tf * idf
    
    feature_name = {}
    feature_name['feature'] = tfidf.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn 1.0

    tfidf_mat = tfidf_sparse.toarray()

    #------------------------ SAVE -----------------------------------
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")
    tfidf_sparse = sparse.csr_matrix(tfidf_sparse)

    sparse.save_npz(VECT_FOLDER + "tfidf_sparse_" + dt_string + ".npz", tfidf_sparse)
    writeJson_config(FEATURES_FOLDER, ("tfidf_feature_" + dt_string + ".json"), feature_name, append=False)

    #------------------------- DATA VIEW --------------------------------
    TableRecords = []

    terms = tfidf.get_feature_names()
    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TFIDF = tfidf_mat

    for i in range(len(TF)):
        for term, tf, idf, tfidf_w in zip(terms, TF[i], IDF, TFIDF[i]):
            if tf != 0.0:
                Num = np.array([tf, idf, tfidf_w]).round(decimals=3)
                TableRecords.append(['Document_' + str(i), term, Num[0], Num[1], Num[2]])
            
    return TableRecords
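
A hypothetical caller that renders the returned records; the column names below are assumptions inferred from how each row is built:

import pandas as pd

records = featureEngineering()
df = pd.DataFrame(records, columns=['document', 'term', 'tf', 'idf', 'tfidf'])
print(df.head())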
Example #5
def featureSelectionTraining(training=False):  # note: the training flag is currently unused

    threshold = 0.01  # minimum normalized mutual information to keep a feature
    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/training/"
    VECT_FOLDER = "ml_core/vector/training/"

    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"

    LABEL_PATH = "ml_core/data/training/Preprocessed_Dataset_Training.csv"
    TWEET_DATA = pd.read_csv(LABEL_PATH, usecols=["label"])
    tags = TWEET_DATA.label

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    FileName = "tfidf_sparse_training.npz"
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName).toarray()

    json_feature = "tfidf_feature_training.json"
    features = readJson_config(VECT_FOLDER + "features/", json_feature,
                               'feature')[0]

    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")

    # ---------------------------- TRAINING -----------------------------
    # score each feature against the labels and normalize to [0, 1]
    mi = mutual_info_classif(tfidf_mat, tags)
    norm_mi = mi / np.max(mi)

    # drop columns whose normalized mutual information falls below the threshold
    column_idx = [i for i, mi_item in enumerate(norm_mi) if mi_item < threshold]
    tfidf_mat_selection = np.delete(tfidf_mat, column_idx, 1)

    # template data: remember which features survived selection
    selected_idx = [j for j in range(len(norm_mi)) if j not in column_idx]
    selected_features = [features[idx] for idx in selected_idx]

    tfidf_mat_template = [0.0] * len(selected_features)  # all-zero template row
    features_template = selected_features

    #-------------------------------- SAVE -----------------------------------
    # Save template
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(VECT_TEMPLATE, tfidf_sparse_template)

    feature_dict = {}
    feature_dict['feature'] = features_template
    writeJson_config(TEMPLATE_FOLDER,
                     FEATURE_CONFIG,
                     feature_dict,
                     append=False)

    # save training data
    tfidf_sparse = sparse.csr_matrix(tfidf_mat_selection)
    sparse.save_npz(VECT_SEL_FOLDER + "tfidf_selection_sparse_training.npz",
                    tfidf_sparse)
    writeJson_config(VECT_SEL_FOLDER + "features/",
                     "tfidf_feature_training.json",
                     {'feature': features_template},  # keyed 'feature' like the template file
                     append=False)

    return 'success'
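
The selection rule reduces to: score every column with mutual information against the labels, normalize the scores to [0, 1], and keep the columns at or above the threshold. A self-contained sketch on toy data (the data and the label rule are made up; the threshold mirrors the code above):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)
X = rng.random((20, 5))                # toy TF-IDF matrix (hypothetical)
y = (X[:, 0] > 0.5).astype(int)        # labels driven by column 0 (toy)

mi = mutual_info_classif(X, y, random_state=0)
norm_mi = mi / np.max(mi)
X_selected = X[:, norm_mi >= 0.01]     # same rule as deleting the complement
print(X_selected.shape)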