Example #1
def extract_features(train_dir, test_dir=""):
    print_message("Extracting Features")

    train_data, train_labels, test_data, test_labels = get_data(
        test_dir, train_dir)
    glbs.LABELS = train_labels + test_labels
    glbs.TRAIN_DATA = train_data

    feature_lst = []
    # add all the N-Grams feature to the list
    for feature in glbs.FEATURES:
        if is_ngrams(feature):
            vectorizer = get_vectorizer(feature)
            feature_lst = add_feature(feature_lst, feature, vectorizer)
    # add all the stylistic features to the list
    for feature in glbs.STYLISTIC_FEATURES:
        vectorizers = get_stylistic_features_vectorizer(feature)
        for i in range(len(vectorizers)):
            feature_lst = add_feature(feature_lst, feature + str(i),
                                      vectorizers[i])
    # combine the list into one vectorizer using FeatureUnion

    all_features = FeatureUnion(feature_lst)
    train_features = all_features.fit_transform(train_data)

    test_features = all_features.transform(test_data)

    return train_features, train_labels, test_features, test_labels, all_features
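
The pattern above hinges on scikit-learn's FeatureUnion: every (name, vectorizer) pair in feature_lst is fit on the training texts and the resulting matrices are concatenated column-wise, so the test set is only transform()-ed with the vocabularies learned from training. A minimal, self-contained sketch of that flow, with toy data and stock vectorizers standing in for the project's get_vectorizer/add_feature helpers:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

train_data = ["the cat sat", "the dog ran"]   # toy stand-in for get_data() output
test_data = ["a cat ran"]

# each entry mirrors one add_feature(...) call: (feature name, vectorizer)
feature_lst = [
    ("word_unigrams", CountVectorizer(ngram_range=(1, 1))),
    ("char_trigrams_tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 3))),
]

all_features = FeatureUnion(feature_lst)
train_features = all_features.fit_transform(train_data)  # fit only on the training texts
test_features = all_features.transform(test_data)        # reuse the learned vocabularies

print(train_features.shape, test_features.shape)
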
Example #2
def classify(train,
             tr_labels,
             test,
             ts_labels,
             all_features,
             num_iteration=1,
             model_number=0):
    results = {}
    result = []
    le = LabelEncoder()
    le.fit(tr_labels)
    ts_labels = le.transform(ts_labels)
    tr_labels = le.transform(tr_labels)
    print_message("Classifying")

    # if os.path.exists(temp_file_path):
    #  results = load_backup_file(temp_file_path)
    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier in results.keys():
            continue

        if classifier == "rnn":
            clf = get_rnn_model(train)
            clf.fit(train, tr_labels, epochs=3, batch_size=64)
        # scores = clf.evaluate(test, ts_labels, verbose=0)
        # acc_score += scores[1]
        else:
            clf = methods[classifier]
            clf.fit(train, tr_labels)
            prediction = clf.predict(test)
            decision = []
            try:
                # prefer the classifier's decision scores when it provides them
                decision = clf.decision_function(test)
            except Exception:
                # fall back to the probability of the positive class
                decision = clf.predict_proba(test)
                decision = decision[:, 1]

            result = get_results(ts_labels, prediction, decision)

        model_path = os.path.join(glbs.RESULTS_PATH,
                                  "Model " + classifier + str(model_number))
        save_model(clf, model_path)
        del clf

        results[classifier] = result
        # save_backup_file(results, temp_file_path)
    # print(results)
    return results
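
The try/except around decision_function handles the fact that margin-based classifiers (e.g. LinearSVC) expose decision_function while others (e.g. RandomForestClassifier) only expose predict_proba, and ranking metrics need one continuous score per test sample either way. A hedged, self-contained sketch of that fallback together with the LabelEncoder step, on synthetic data rather than the project's features:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)
labels = ["ham" if v == 0 else "spam" for v in y]   # toy string labels

le = LabelEncoder()
y_enc = le.fit_transform(labels)                    # "ham"/"spam" -> 0/1
X_tr, X_te, y_tr, y_te = train_test_split(X, y_enc, random_state=0)

for clf in (LinearSVC(), RandomForestClassifier(random_state=0)):
    clf.fit(X_tr, y_tr)
    prediction = clf.predict(X_te)
    try:
        decision = clf.decision_function(X_te)      # margin scores (LinearSVC has these)
    except AttributeError:
        decision = clf.predict_proba(X_te)[:, 1]    # probability of the positive class
    print(type(clf).__name__, prediction[:5], decision[:5])
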
Example #3
def extract_features(dataset_dir):
    print_message("Extracting Features")

    X, y = get_data(dataset_dir)
    glbs.LABELS = y
    glbs.DATASET_DATA = X

    ########################################
    # X, y = zip(*list(zip(X, y))[:160])
    # from help_functions import get_fetuer_by_DF

    # get_fetuer_by_DF(X)
    ########################################

    feature_lst = []
    # add all the N-Grams feature to the list
    for feature in glbs.FEATURES:
        if is_ngrams(feature):
            vectorizer = get_vectorizer(feature)
            feature_lst = add_feature(feature_lst, feature, vectorizer)
    # add all the stylistic features to the list
    for feature in glbs.STYLISTIC_FEATURES:
        vectorizers = get_stylistic_features_vectorizer(feature)
        for i in range(len(vectorizers)):
            feature_lst = add_feature(feature_lst, feature + str(i),
                                      vectorizers[i])

    # combine the list into one vectorizer using FeatureUnion
    if glbs.MULTIPROCESSING:
        n_jobs = -1
    else:
        n_jobs = None
    all_features = FeatureUnion(feature_lst, n_jobs=n_jobs)

    glbs.FEATURE_MODEL.append(all_features)

    all_features.fit(X, y)
    glbs.NUM_OF_FEATURE = len(all_features.get_feature_names())

    if glbs.SELECTION:
        from feature_selction import get_selected_features

        get_selected_features(X, y, all_features)

    return X, y
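
Here FeatureUnion is also given n_jobs, so -1 fits the sub-vectorizers in parallel while None keeps everything in a single process, and the total feature count is read back for bookkeeping. Note that get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; on current versions the equivalent call is get_feature_names_out(). A small sketch of that bookkeeping, with toy data and made-up vectorizer choices:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

X = ["first document", "second document here"]   # toy stand-in for get_data()

all_features = FeatureUnion(
    [("words", CountVectorizer()),
     ("chars", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3)))],
    n_jobs=-1,   # parallel fit of the sub-vectorizers; None = single process
)
all_features.fit(X)

# older scikit-learn (as in the snippet): len(all_features.get_feature_names())
num_of_features = len(all_features.get_feature_names_out())
print(num_of_features)
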
Example #4
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")

    pickle_path = glbs.RESULTS_PATH + "\\Pickle files"
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)

    xlsx_path = glbs.RESULTS_PATH + "\\Xlsx files"
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
    time.sleep(0.5)
    os.makedirs(xlsx_path)

    for key in results.keys():
        with open(pickle_path + "\\" + key + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(pickle_path + "\\" + key + ".pickle", key, xlsx_path)
Example #5
def write_results(results):
    glbs = GlobalParameters()
    print_message("Writing results...")
    # add_to_csv(results, glbs.RESULTS_PATH)

    pickle_path = os.path.join(glbs.RESULTS_PATH, "Pickle files")
    if path.exists(pickle_path):
        shutil.rmtree(pickle_path, ignore_errors=True)
    os.makedirs(pickle_path)

    xlsx_path = os.path.join(glbs.RESULTS_PATH, "Xlsx files")
    if path.exists(xlsx_path):
        shutil.rmtree(xlsx_path, ignore_errors=True)
    time.sleep(0.5)
    os.makedirs(xlsx_path)

    for key in results.keys():
        with open(os.path.join(pickle_path, key) + ".pickle", "wb+") as file:
            pickle.dump(results[key], file)
        new_write_file_content(
            os.path.join(pickle_path, key) + ".pickle", key, xlsx_path)
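
Both write_results variants dump each entry of the results dict with pickle before handing the file to new_write_file_content; this later variant is the portable one, since os.path.join builds the separators for the current OS instead of hard-coding backslashes. A small round-trip sketch with made-up paths and keys:

import os
import pickle

results = {"config_a": {"svm": {"accuracy": [0.81, 0.84]}}}   # toy results dict
out_dir = os.path.join("results", "Pickle files")
os.makedirs(out_dir, exist_ok=True)

for key, value in results.items():
    path = os.path.join(out_dir, key + ".pickle")
    with open(path, "wb") as fh:      # binary mode is required for pickle
        pickle.dump(value, fh)
    with open(path, "rb") as fh:      # read it back, as the xlsx/report step would
        assert pickle.load(fh) == value
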
Example #6
def main(cfg):
    try:
        # nltk.download("vader_lexicon")
        # nltk.download('wordnet')
        glbs = GlobalParameters()
        configs = get_cfg_files(cfg)
        total_files = len(configs)
        results = {}
        for i, config in enumerate(configs):
            print_message("Running config {}/{}".format(i + 1, total_files))
            set_global_parameters(config)
            print_run_details()
            dataset_dir = normalize()
            X, y = extract_features(dataset_dir)
            config_result = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS)
            glbs.RESULTS[glbs.FILE_NAME] = config_result
            glbs.RESULTS = add_results(glbs.RESULTS, glbs)
            if glbs.EXPORT_AS_BASELINE:
                export_as_baseline(config_result, config[1])
        if glbs.WORDCLOUD:
            print_message("Generating word clouds (long processes)")
            generate_word_clouds()
        add_results_glbs(results, glbs)
        write_results(divide_results(glbs.RESULTS))
        send_work_done(glbs.DATASET_DIR)
        print_message("Done!")
    except Exception as e:
        traceback.print_exc()
        send_work_done(glbs.DATASET_DIR,
                       "",
                       error=str(e),
                       traceback=str(traceback.format_exc()))
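
Both drivers wrap the whole configuration loop in a single try/except so that a crash anywhere still produces a notification carrying the formatted traceback. A minimal sketch of that pattern, with a hypothetical notify() standing in for send_work_done():

import traceback

def notify(target, message="", error="", tb=""):
    # hypothetical stand-in for send_work_done(); just prints here
    print("notify:", target, message, error, tb[:80])

def run(configs):
    try:
        for i, config in enumerate(configs):
            print("Running config {}/{}".format(i + 1, len(configs)))
            # ... normalize / extract_features / classify would go here ...
        notify("dataset_dir", "Done!")
    except Exception as e:
        traceback.print_exc()
        notify("dataset_dir", "", error=str(e), tb=traceback.format_exc())

run([{"name": "cfg1"}, {"name": "cfg2"}])
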
Example #7
def normalize(test=False):
    init_tools()
    n = glbs.NORMALIZATION
    if not test:
        i = glbs.TRAIN_DIR
        print_message("Normalizing")
    else:
        i = glbs.TEST_DIR
        print_message("Normalizing test dataset")
    if not is_nargs_empty(n):
        # the normalized folder
        parent_dir = i + "@" + n
        # create the dir if does not exist
        if not os.path.exists(parent_dir):
            os.mkdir(parent_dir)
            for category in os.listdir(i):
                with open(os.path.join(i, category),
                          'r',
                          encoding='utf8',
                          errors='ignore') as read:
                    n_lines = []
                    for line in read:
                        line = line.rstrip('\n')
                        n_lines.append(normal(line, n))
                n_file = '\n'.join(n_lines)
                del n_lines
                # write normalized_file
                with open(os.path.join(parent_dir, category),
                          'w+',
                          encoding='utf8') as write:
                    write.write(n_file)
                del n_file
        else:  # if it does exist
            print_message("found normalized dataset")
            # check if both dirs have the same number of files
            if not len(os.listdir(parent_dir)) == len(os.listdir(i)):
                # delete normalized folder and create a new one.
                print_message(
                    "Corrupted normalization found, deleting and starting over..."
                )
                shutil.rmtree(parent_dir, ignore_errors=True)
                # re-normalize the same split (train or test) that was requested
                normalize(test)

        glbs.NORM_PATH = parent_dir
    else:
        glbs.NORM_PATH = i

    return glbs.NORM_PATH
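
The function caches the normalized corpus in a sibling directory named "<input dir>@<normalization flags>": if that folder already exists and holds the same number of files as the input it is reused, otherwise it is deleted and rebuilt. A compact sketch of the same caching idea, with plain lowercasing standing in for the project's normal() and no globals:

import os
import shutil

def normalize_dir(input_dir, flags="lc"):
    """Cache a normalized copy of input_dir in '<input_dir>@<flags>' (sketch only)."""
    parent_dir = input_dir + "@" + flags
    if os.path.exists(parent_dir):
        if len(os.listdir(parent_dir)) == len(os.listdir(input_dir)):
            return parent_dir                             # cache hit, reuse it
        shutil.rmtree(parent_dir, ignore_errors=True)     # corrupted cache, rebuild
    os.mkdir(parent_dir)
    for category in os.listdir(input_dir):
        with open(os.path.join(input_dir, category), encoding="utf8", errors="ignore") as read:
            lines = [line.rstrip("\n").lower() for line in read]   # stands in for normal(line, flags)
        with open(os.path.join(parent_dir, category), "w", encoding="utf8") as write:
            write.write("\n".join(lines))
    return parent_dir
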
Example #8
def selectionHalfMethod(X, y, all_features):
    glbs = GlobalParameters()
    filename = glbs.FILE_NAME
    results = {}
    # nxt = (glbs.SELECTION[0][0], int(glbs.SELECTION[0][1]))
    nxt = (glbs.SELECTION[0][0], int(glbs.SELECTION[0][1]))
    max_last_result = 0
    bottom = (0, 0)
    top = nxt
    while top != bottom:
        max_nxt_result = 0
        print_message(nxt[0])
        print_message(nxt[1])
        glbs.FILE_NAME = glbs.FILE_NAME + str(nxt[1])
        select = select_k_best(nxt[0], int(nxt[1]))
        glbs.FEATURE_MODEL[1] = select
        results[glbs.FILE_NAME] = classify(X, y, glbs.K_FOLDS, glbs.ITERATIONS)
        for method in results[glbs.FILE_NAME].items():
            if mean(method[1]["accuracy"]) > max_nxt_result:
                max_nxt_result = mean(method[1]["accuracy"])
        results = add_results(results, glbs, nxt)
        if max_nxt_result >= max_last_result:
            top = nxt
            if bottom[1] == 0:
                nxt = (nxt[0], int(int(nxt[1]) / 2))
            if bottom[1] != 0:
                nxt = (nxt[0], int((int(nxt[1]) + bottom[1]) / 2))
            max_last_result = max_nxt_result
        elif max_nxt_result < max_last_result:
            bottom = nxt
            nxt = (nxt[0], int((top[1] + bottom[1]) / 2))
        glbs.SELECTION[0] = nxt
        if bottom[1] - top[1] == -1 and bottom == nxt:
            break
    glbs.FILE_NAME = filename
    add_results_glbs(results, glbs)
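
selectionHalfMethod is a halving search over SelectKBest's k: while accuracy does not drop, the upper bound (top) moves down and k is halved or bisected toward bottom; once accuracy drops, the failing k becomes bottom and the search bisects between top and bottom until the two meet. A rough, self-contained sketch of that idea on synthetic data, with LogisticRegression and cross_val_score replacing the project's glbs-driven classify():

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=300, n_features=200, n_informative=20, random_state=0)

def score_with_k(k):
    pipe = Pipeline([("select", SelectKBest(f_classif, k=k)),
                     ("classifier", LogisticRegression(max_iter=1000))])
    return cross_val_score(pipe, X, y, cv=5).mean()

# shrink k while accuracy holds, then bisect between the last good value (top)
# and the first failing one (bottom), mirroring the loop above
top, bottom = 200, 0
best = score_with_k(top)
k = top // 2
while top - bottom > 1:
    current = score_with_k(k)
    if current >= best:
        top, best = k, current
        k = (k + bottom) // 2 if bottom else k // 2
    else:
        bottom = k
        k = (top + bottom) // 2
print("selected k:", top, "cv accuracy:", round(best, 3))
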
Example #9
def main(cfg):
    try:
        glbs = GlobalParameters()
        configs = get_cfg_files(cfg)
        results = {}
        n_test_dir = ""
        total_files = len(configs)
        for i, config in enumerate(configs):
            print_message("Running config {}/{}".format(i + 1, total_files))
            set_global_parameters(config)
            print_run_details()
            n_train_dir = normalize()
            if glbs.TEST_DIR != "":
                n_test_dir = normalize(test=True)
            train, tr_labels, test, ts_labels, all_features = extract_features(
                n_train_dir, n_test_dir)
            for selection in glbs.SELECTION:
                try:
                    train, test = get_selected_features(
                        selection, train, tr_labels, test, ts_labels,
                        all_features)
                except Exception:
                    # keep the unselected feature matrices if this selection fails
                    pass
            results[glbs.FILE_NAME] = classify(train,
                                               tr_labels,
                                               test,
                                               ts_labels,
                                               all_features,
                                               model_number=i)
            results = add_results(results)
        if glbs.WORDCLOUD:
            print_message("Generating word clouds (long processes)")
            generate_word_clouds()
        write_results(divide_results(results))
        send_work_done(glbs.TRAIN_DIR)
        print_message("Done!")
        # clean_backup_files()
    except Exception as e:
        traceback.print_exc()
        send_work_done(glbs.TRAIN_DIR,
                       "",
                       error=str(e),
                       traceback=str(traceback.format_exc()))
Example #10
def classify(X, y, k_fold, num_iteration=1):
    results = {}
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)
    print_message("Classifying")

    """def cross_validation(X, classifier, clf, i, k_fold, num_iteration, y):
        print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE)
        for measure in glbs.MEASURE:
            if measure in results[classifier].keys():
                results[classifier][measure] += list(scores['test_' + measure])
            else:
                results[classifier][measure] = list(scores['test_' + measure])"""

    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier not in results.keys():
            results[classifier] = {}

        if classifier == "rnn":
            # clf = get_rnn_model(X)
            continue
        else:
            clf = methods[classifier]
        lst = []
        if glbs.SELECTION:
            lst.append(("feture", glbs.FEATURE_MODEL[0]))
            lst.append(("select", glbs.FEATURE_MODEL[1]))
            lst.append(("classifier", clf))
            clf = Pipeline(lst)

        else:
            clf = make_pipeline(glbs.FEATURE_MODEL[0], clf)

        if glbs.MULTIPROCESSING:
            n_jobs = -1
        else:
            n_jobs = None

        ####################################################################
        # Used for parameters tuning
        from sklearn.model_selection import RandomizedSearchCV
        # Number of trees in random forest
        n_estimators = [int(x) for x in np.linspace(start=1000, stop=2000, num=10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10, 15]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4, 6]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'classifier__n_estimators': n_estimators,
                       'classifier__max_features': max_features,
                       'classifier__max_depth': max_depth,
                       'classifier__min_samples_split': min_samples_split,
                       'classifier__min_samples_leaf': min_samples_leaf,
                       'classifier__bootstrap': bootstrap}
        print(random_grid)

        # Use the random grid to search for best hyperparameters
        # Random search of parameters, using 3 fold cross validation,
        # search across 100 different combinations, and use all available cores
        rf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid, n_iter=100, cv=k_fold, verbose=2,
                                       random_state=42, n_jobs=-1)

        print(clf.get_params().keys())
        # Fit the random search model
        rf_random.fit(X, y)

        print(rf_random.best_params_)
        clf = rf_random.best_estimator_
        with open("best_estimator.json", "w") as file:
            json.dump(rf_random.best_params_, file, indent=6)

        """best_random = rf_random.best_estimator_
        scores = cross_validate(best_random, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
        random_accuracy = numpy.mean(list(scores["test_accuracy"]))

        base_model = clf
        scores = cross_validate(base_model, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs)
        base_accuracy = numpy.mean(list(scores["test_accuracy"]))

        print('Base accuracy: {:0.2f}%.'.format(100 * base_accuracy))
        print('Random accuracy: {:0.2f}%.'.format(100 * random_accuracy))

        print('Improvement of {:0.2f}%.'.format(100 * (random_accuracy - base_accuracy) / base_accuracy))

        stop = "Put breakpoint here"""""
        ####################################################################

        for i in range(num_iteration):
            # Shuffle the inner order of the posts within each fold
            Xy = list(zip(X, y))
            splited = []
            len_l = len(Xy)
            for j in range(k_fold):
                start = int(j * len_l / k_fold)
                end = int((j + 1) * len_l / k_fold)
                splited.append(Xy[start:end])
            Xy = []
            for fold in splited:
                random.Random(num_iteration).shuffle(fold)
                Xy += fold
            X, y = zip(*Xy)

            print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
            scores = cross_validate(
                clf, X, y, cv=k_fold, scoring=glbs.MEASURE, n_jobs=n_jobs
            )
            for measure in glbs.MEASURE:
                if measure in results[classifier].keys():
                    results[classifier][measure] += list(scores["test_" + measure])
                else:
                    results[classifier][measure] = list(scores["test_" + measure])

        del clf
    return results
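
The tuning block works because a Pipeline exposes the parameters of its steps as "<step name>__<parameter>", so the keys in random_grid must match the step named "classifier"; RandomizedSearchCV then samples n_iter combinations, cross-validates each one, and refits the best as best_estimator_. A smaller, self-contained sketch of the same setup with toy data and a trimmed grid:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, n_features=20, random_state=42)

# the final step is named "classifier", so tuned parameters use the "classifier__" prefix
clf = Pipeline([("scale", StandardScaler()),
                ("classifier", RandomForestClassifier(random_state=42))])

random_grid = {
    "classifier__n_estimators": [int(x) for x in np.linspace(100, 500, num=5)],
    "classifier__max_depth": [10, 30, None],
    "classifier__min_samples_split": [2, 5, 10],
}

rf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid,
                               n_iter=10, cv=3, random_state=42, n_jobs=-1)
rf_random.fit(X, y)
print(rf_random.best_params_)
best_clf = rf_random.best_estimator_   # the refit pipeline with the winning parameters
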
Example #11
def clean_backup_files():
    glbs = GlobalParameters()
    print_message("removing temp files...")
    folder_path = os.sep.join(glbs.RESULTS_PATH.split(
        os.sep)[:-1]) + os.sep + "temp_backups"
    shutil.rmtree(folder_path, ignore_errors=True)
def classify(X, y, k_fold, num_iteration=1):
    results = {}
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)
    print_message("Classifying")
    """def cross_validation(X, classifier, clf, i, k_fold, num_iteration, y):
        print_message("iteration " + str(i + 1) + "/" + str(num_iteration), 2)
        scores = cross_validate(clf, X, y, cv=k_fold, scoring=glbs.MEASURE)
        for measure in glbs.MEASURE:
            if measure in results[classifier].keys():
                results[classifier][measure] += list(scores['test_' + measure])
            else:
                results[classifier][measure] = list(scores['test_' + measure])"""

    for classifier in glbs.METHODS:
        print_message("running " + str(classifier), num_tabs=1)
        if classifier not in results.keys():
            results[classifier] = {}

        if classifier == "rnn":
            # clf = get_rnn_model(X)
            continue
        else:
            clf = methods[classifier]
        lst = []
        if glbs.SELECTION:
            lst.append(("feture", glbs.FEATURE_MODEL[0]))
            lst.append(("select", glbs.FEATURE_MODEL[1]))
            lst.append(("classifier", clf))
            clf = Pipeline(lst)
            A = clf.fit(X, y)
            print(A)

        else:
            clf = make_pipeline(glbs.FEATURE_MODEL[0], clf)

        if glbs.MULTIPROCESSING:
            n_jobs = -1
        else:
            n_jobs = None

        for i in range(num_iteration):
            # Shuffle the inner order of the posts within each fold
            Xy = list(zip(X, y))
            splited = []
            len_l = len(Xy)
            for j in range(k_fold):
                start = int(j * len_l / k_fold)
                end = int((j + 1) * len_l / k_fold)
                splited.append(Xy[start:end])
            Xy = []
            for fold in splited:
                random.Random(num_iteration).shuffle(fold)
                Xy += fold
            X, y = zip(*Xy)

            print_message("iteration " + str(i + 1) + "/" + str(num_iteration),
                          2)
            scores = cross_validate(clf,
                                    X,
                                    y,
                                    cv=k_fold,
                                    scoring=glbs.MEASURE,
                                    n_jobs=n_jobs)
            for measure in glbs.MEASURE:
                if measure in results[classifier].keys():
                    results[classifier][measure] += list(scores["test_" +
                                                                measure])
                else:
                    results[classifier][measure] = list(scores["test_" +
                                                               measure])

        del clf
    return results
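
The scoring loop relies on cross_validate returning one "test_<measure>" array per requested scorer, with one value per fold, which is why glbs.MEASURE must hold scorer names that scikit-learn recognizes (for example "accuracy" or "f1_macro"). A minimal sketch of that accumulation step on synthetic data:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=200, random_state=0)
measures = ["accuracy", "f1_macro"]   # stands in for glbs.MEASURE

results = {"logistic": {}}
scores = cross_validate(LogisticRegression(max_iter=1000), X, y,
                        cv=5, scoring=measures, n_jobs=None)
for measure in measures:
    # cross_validate reports each scorer under "test_<name>", one value per fold
    results["logistic"].setdefault(measure, []).extend(scores["test_" + measure])
print(results)
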