Ejemplo n.º 1
0
"""

import numpy as np
import pickle
from import_data import load_pickle
from features import simple_features, feature_array

# Sub-folder names; each one is appended to ``path`` by the loop that follows.
folders = [
    'MM09', 'MM10', 'MM11', 'MM12', 'MM14', 'MM15', 'MM16', 'MM18', 'MM19',
    'MM20', 'MM21',
]
# Raw string: the backslashes in this Windows path previously formed invalid
# escape sequences (\S, \O, \K, \D), which Python only tolerates with a
# DeprecationWarning/SyntaxWarning. The resulting value is unchanged.
path = r"C:/Users\SB00745777\OneDrive - Ulster University\KaraOne\Data/"

for f in folders:
    new_path = path + f
    data = load_pickle(new_path, "window_data.p")
    labels = load_data(new_path, "labels", "labels")
    feature_vector = []
    for tr in data:

        wdw_mn, wdw_absmn, wdw_sm, wdw_sd, wdw_md, wdw_vr, wdw_mx, wdw_absmx, wdw_mnm, wdw_absmin, wdw_mxmn, wdw_mnmx = (
            [] for i in range(12))
        for wdw in tr:
            ch_mn, ch_absmn, ch_sm, ch_sd, ch_md, ch_vr, ch_mx, ch_absmx, ch_mnm, ch_absmin, ch_mxmn, ch_mnmx = (
                [] for i in range(12))
            for ch in wdw:
                mn, absmn, sd, sm, md, vr, mx, absmx, mnm, absmin, mxmn, mnmx = simple_features(
                    ch)
                ch_mn.append(mn)
                ch_absmn.append(absmn)
                ch_sm.append(sm)
Ejemplo n.º 2
0
import pandas as pd
import time as time

# from neural_network_classifier import run_neural_network
# from xgboost_classifier import run_xgboost
from import_data import load_pickle
from feature_extraction import extract_features
# from svm_classifier import run_svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from neural_network_classifier import magic

# Load the four pickled data sets, in the same order as the file names below.
train1_df, test1_df, train2_df, test2_df = (
    load_pickle(pickle_path) for pickle_path in (
        'data/train1.pkl',
        'data/test1.pkl',
        'data/train2.pkl',
        'data/test2.pkl',
    ))

# Earlier experiments, left disabled:
# charf_tr1 = extract_features(train1_df, chars=True)
# wordf_tr1 = extract_features(train1_df, words=True)
# posf_tr1 = extract_features(train1_df, pos_tags = True)
# posf_ts1 = extract_features(test1_df, pos_tags=True)

# print("svm w/ countvectorizer on n-gram chars")

# Run extract_features on train1_df with a TfidfVectorizer restricted to
# character n-grams of exactly length n, for n = 1 and then n = 2.
tr1_char_1, tr1_char_2 = (
    extract_features(
        train1_df,
        TfidfVectorizer(analyzer="char", ngram_range=(n, n), binary=False))
    for n in (1, 2))
tr1_char_3 = extract_features(
    train1_df,
Ejemplo n.º 3
0
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from import_data import load_pickle
from feature_extraction import extract_features
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load 'data/train1.pkl' through the project's load_pickle helper.
train1_df = load_pickle('data/train1.pkl')

# NOTE(review): the name suggests count-based character 2-grams, but the
# vectorizer passed here is a TfidfVectorizer over word 3-grams
# (analyzer="word", ngram_range=(3, 3)). Confirm which one is intended and
# rename or reconfigure accordingly.
count_tr1_char_2 = extract_features(
    train1_df,
    TfidfVectorizer(analyzer="word", ngram_range=(3, 3), binary=False))


def run_svm(df):
    """Cross-validate a linear-kernel SVC on the columns of *df*.

    The ``"text"`` column is used as the samples and the ``"author"`` column,
    after label encoding, as the targets. Scoring uses ``cross_val_score``
    with ``cv=5``.

    Returns:
        A three-element list: the per-fold score array, its mean and its
        standard deviation.
    """
    samples = df["text"].tolist()

    # Map the author labels to integer codes before handing them to sklearn.
    author_codes = LabelEncoder().fit_transform(df["author"].tolist()).tolist()

    fold_scores = cross_val_score(
        SVC(gamma="auto", kernel="linear"),
        samples,
        author_codes,
        cv=5,
    )
    return [fold_scores, fold_scores.mean(), fold_scores.std()]


# Script entry point: run the cross-validation on count_tr1_char_2 and print
# the [scores, mean, std] list that run_svm returns.
print(run_svm(count_tr1_char_2))
Ejemplo n.º 4
0
from sklearn.multiclass import OneVsRestClassifier
# Folder names; each one is appended to ``path`` by the loop that follows.
folders = [
    'MM09', 'MM10', 'MM11', 'MM12', 'MM14', 'MM15', 'MM16', 'MM18', 'MM19',
    'MM20', 'MM21',
]
# Raw string: the backslashes in this Windows path previously formed invalid
# escape sequences (\c, \O, \K, \D), which Python only tolerates with a
# DeprecationWarning/SyntaxWarning. The resulting value is unchanged.
path = r"C:/Users\cfcoo\OneDrive - Ulster University\KaraOne\Data/"
# Class labels.
classes = [
    '/uw/', '/tiy/', '/iy/', '/m/', '/n/', '/piy/', '/diy/', 'gnaw', 'pat',
    'knew', 'pot',
]

# DataFrame for saving CV scores.
classifier_scores = pd.DataFrame()

for f in folders:
    new_path = path + f
    data = load_pickle(new_path, "td_df.p")
    features = data.Features.tolist()
    features = np.array(features)
    targets = np.array(data.Targets)

    class_names = order_class_labels(
        classes,
        targets)  #orders class_names in the order they first appear in dataset
    X_Train, X_Test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.2,
                                                        random_state=42)

    clf = SVC(kernel='linear', class_weight='balanced')
    pca = PCA(svd_solver='randomized', whiten=True)