Example #1
def main():

    junk_param = sys.argv[1]
    classifier_param = sys.argv[2]
    train_param = sys.argv[3]
    #split_param = sys.argv[4]

    print('Main Program Begins : ')
    write_csv.generate_inkml_file_list()
    symbol_data_obj_list, junk_data_obj_list, test_data_obj_list = parse_data.parse_data(
        junk_param)
    print(len(symbol_data_obj_list))
    print(len(junk_data_obj_list))
    print(len(test_data_obj_list))

    print('object created')
    symbol_data_obj_list = feature_extraction.get_features(
        symbol_data_obj_list, 'symbol_feature_list.csv')
    junk_data_obj_list = feature_extraction.get_features(
        junk_data_obj_list, 'junk_feature_list.csv')
    test_data_obj_list = feature_extraction.get_features(
        test_data_obj_list, 'test_feature_list.csv')
    print('Features extracted')

    prediction_file, GT_file = classification_driver.classification(
        junk_param, classifier_param, train_param)
    # Evaluate predictions against the ground truth file using evalSymbIsole
    if (prediction_file is not None and GT_file is not None):
        command = 'python evalSymbIsole.py ' + data_folder + GT_file + ' ' + data_folder + prediction_file + ' HTML > output.html'
        #After this we can save all features in one csv as a table with final column as output(GT)
        #This will also save time for parsing ISO files again and again.
        os.system(command)
    print('Done!')
Example #2
def main():
    path_train = "./Train"
    path_test = "./Test"

    N = len(os.listdir(path_train))
    # D follows from the framing: 0.025 + 0.01 * (x - 1) = 3 seconds gives x ≈ 299 frames, each with 13 cepstral coefficients, so D = 299 * 13
    N_test = len(os.listdir(path_test))
    D = 299 * 13

    X, N, D = get_features(path_train)
    X_test, N_test, _ = get_features(path_test)
    y = get_labels(path_train)
    y_test = get_labels(path_test)
    write_features("./X.txt", X, N, D)
    write_features("./X_test.txt", X_test, N_test, D)
    write_labels("./y.txt", y, N)
    write_labels("./y_test.txt", y_test, N_test)

    X = read_features("./X.txt", N, D)
    X_test = read_features("./X_test.txt", N_test, D)
    y = read_labels("./y.txt", N)
    y_test = read_labels("./y_test.txt", N_test)

    # run on linear svm model
    linear_svm(path_train, path_test, X=X, X_test=X_test, y=y, y_test=y_test)
    # run on rbf kernel svm model
    rbf(path_train, path_test, X=X, X_test=X_test, y=y, y_test=y_test)
Example #3
def test_data(input_path, model_path):
    data = os.listdir(input_path)
    model = load_model(model_path)
    # reset the timing log that is appended to inside the loop below
    with open('output/running_time.txt', 'w') as f:
        pass

    for element in data:
        was = time()
        img_path = os.path.join(input_path, element)
        image = io.imread(img_path)
        words = get_char_images_pred(image)

        test_points = []
        for word in words:
            for char in word:
                test_points.append(get_features(char, False))

        predictions = model.predict(test_points)
        new_pred = []
        current = 0
        for i in range(len(words)):
            for _ in range(len(words[i])):
                new_pred.append(predictions[current])
                current += 1
            new_pred.append(0)
        element = element.split('.')[0]
        element = element + '.txt'
        path = os.path.join("output", "text", element)
        save_predictions(new_pred, path)
        time_taken = time() - was
        with open('output/running_time.txt', 'a') as f:
            f.write(str(time_taken))
            f.write('\n')
Example #4
def get_samples_from_arr(arr, arr_len=220, step=20, n_samples=1):
    output = np.zeros((n_samples, n_features * n_channels))
    for i in range(n_samples):
        for j in range(n_channels):
            feat_list = get_features(arr[i * step:i * step + arr_len, j])
            output[i, j * n_features:(j + 1) * n_features] = feat_list
    return output
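The function above depends on module-level n_features and n_channels globals and a get_features helper that are not shown. A minimal sketch of how they might be wired together; the names and per-window statistics below are assumptions, not the original project's features:

import numpy as np

n_channels = 3   # assumed number of signal channels
n_features = 4   # assumed length of the per-window feature vector

def get_features(window):
    # illustrative stand-in: simple per-window statistics
    return [window.mean(), window.std(), window.min(), window.max()]

# 220-sample windows advanced by 20 samples over a (400, 3) signal,
# assuming get_samples_from_arr above is defined in the same module
signal = np.random.randn(400, n_channels)
samples = get_samples_from_arr(signal, arr_len=220, step=20, n_samples=9)
print(samples.shape)   # (9, n_features * n_channels)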
Example #5
def perform_clustering(path, method, threshold):
    prefix = '/Users/lyudakopeikina/Documents/HSE_FaceRec_tf-master/facial_clustering/lfw_ytf2%s_features.npz'
    #prefix = '/Users/lyudakopeikina/Documents/HSE_FaceRec_tf-master/facial_clustering/faces/features%s.npz'
    crop_center = False
    features_file = os.path.join(path[0],
                                 prefix % (recognizer_list[recognizer_ind][1]))
    print(features_file)
    features, labels = get_features(features_file,
                                    recognizer_list[recognizer_ind][2],
                                    recognizer_list[recognizer_ind][0])
    print(len(features[0]))
    X_norm = preprocessing.normalize(features, norm='l2')

    pair_dist = pairwise_distances(X_norm)
    timer = time.time()

    clusters = clustering_results(pair_dist, method, threshold)
    timer = time.time() - timer
    print('clustering time for', method, timer)
    predictions = -np.ones(len(labels))
    for idx, cluster in enumerate(clusters):
        predictions[cluster] = idx
    idx = len(clusters)
    for i in range(len(predictions)):
        if predictions[i] == -1:
            idx += 1
            predictions[i] = idx

    num_of_classes = len(np.unique(labels))
    num_of_clusters = len(clusters)
    print('features shape:', X_norm.shape, '#classes:', num_of_classes,
          '#clusters:', num_of_clusters)
    return num_of_classes, num_of_clusters, labels, predictions
Example #6
def features_in_path(folder_path, sentiment):
    """Return features from the files in the given folder path, tagged with the given sentiment."""
    all_features = []
    for filename in os.listdir(folder_path)[:1000]:
        path = folder_path + "/" + filename
        tokens = open_and_tokenize(path)
        features = get_features(tokens)
        features["filename"] = filename
        features["sentiment"] = sentiment
        all_features.append(features)
    return all_features
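A minimal usage sketch; the folder paths and sentiment labels are placeholders, and open_and_tokenize / get_features from the same module are assumed to be available:

positive_features = features_in_path("reviews/pos", "positive")
negative_features = features_in_path("reviews/neg", "negative")
print(len(positive_features), len(negative_features))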
Example #7
def features_from_label(audio_file, segment):
    """
    Using the label, extract the features from the segment defined
    by the label.
    """
    duration = segment['end'] - segment['start']
    audio, sample_rate = librosa.core.load(audio_file,
                                           duration=duration,
                                           offset=segment['start'])
    features = fe.get_features(audio, sample_rate)
    return features
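A minimal usage sketch, assuming a label represented as a dict with 'start' and 'end' times in seconds (the file name is a placeholder):

segment = {'start': 1.5, 'end': 3.0}   # seconds
features = features_from_label('recording.wav', segment)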
Example #8
def main():
    POPULATION = 100
    MAX_ITER = 15
    MUTATION_PROB = 0.1

    global df
    df = get_features()
    global n_speakers
    n_speakers = df['a'].cat.categories.size

    # Initial population
    pop = create_initial_population(POPULATION)
    pop_ci = population_ci(pop)
    print(
        '-------------------------\tInitial population\t-------------------------'
    )
    find_best_table(pop, pop_ci)

    for iteration in range(MAX_ITER):
        print('-------------------------\tIteration ' + str(iteration + 1) +
              ' of ' + str(MAX_ITER) + '\t-------------------------')
        # Initialization
        crossover_bag = np.zeros((1, 8), dtype=np.int16)
        # Compute fitness for each table in the population
        pop_fitness = pop_ci * -1
        leveler = np.min(pop_fitness)
        pop_fitness = pop_fitness - leveler
        total_fitness = np.sum(pop_fitness)
        # Generate the crossover_bag
        for i in range(POPULATION):
            perc = pop_fitness[i] / total_fitness
            n = int(round(perc * POPULATION))
            for j in range(n):
                crossover_bag = np.vstack((crossover_bag, pop[i]))
        crossover_bag = crossover_bag[1:]
        # Crossover
        for i in range(POPULATION):
            # Randomly select 2 parents (tables)
            couple = random.sample(range(crossover_bag.shape[0]), 2)
            father = crossover_bag[couple[0]]
            mother = crossover_bag[couple[1]]
            child = crossover(father, mother)
            # Mutation
            if random.random() <= MUTATION_PROB:
                child = mutate(child)
            if i == 0:
                new_pop = child
            else:
                new_pop = np.vstack((new_pop, child))
        pop = new_pop
        # Compute CI for each table in the new population
        pop_ci = population_ci(pop)
        find_best_table(pop, pop_ci)
Example #9
def process_corpus(tr_in_filename, te_in_filename, u_in_filename,
                   tr_out_filename, te_out_filename, u_out_filename):
    with open(tr_in_filename, 'rb') as input_f:
        tr_original_corpus = pickle.load(input_f)

    with open(te_in_filename, 'rb') as input_f:
        te_original_corpus = pickle.load(input_f)

    with open(u_in_filename, 'rb') as input_f:
        u_original_corpus = pickle.load(input_f)
    tr_instances = [d['question'] for d in tr_original_corpus
                    if '' not in d['target']]
    te_instances = [d['question'] for d in te_original_corpus
                    if '' not in d['target']]
    u_instances = [d['question'] for d in u_original_corpus
                   if ('target' not in d) or '' not in d['target']]

    vect = get_features()
    vect.fit(tr_instances + te_instances + u_instances)
    v_instances = vect.transform(tr_instances + te_instances + u_instances)
    v_instances = csr_matrix(v_instances > 0, dtype=int8)
    print(v_instances.shape)

    tr_corpus = Corpus()
    tr_corpus.instances = v_instances[:len(tr_instances)]
    tr_corpus.full_targets = [d['target'] for d in tr_original_corpus
                              if '' not in d['target']]
    tr_corpus.representations = [_get_repr(i[0]) for i in tr_instances]
    tr_corpus._features_vectorizer = vect
    tr_corpus.save_to_file(tr_out_filename)

    te_corpus = Corpus()
    # rows of v_instances are ordered tr, then te, then u
    te_corpus.instances = v_instances[len(tr_instances):len(tr_instances) + len(te_instances)]
    te_corpus.full_targets = [d['target'] for d in te_original_corpus
                              if '' not in d['target']]
    te_corpus.representations = [_get_repr(i[0]) for i in te_instances]
    te_corpus._features_vectorizer = vect
    te_corpus.save_to_file(te_out_filename)

    u_corpus = Corpus()
    u_corpus.instances = v_instances[len(tr_instances) + len(te_instances):]
    u_corpus.full_targets = [d['target']
                             if ('target' in d and '' not in d['target']) else []
                             for d in u_original_corpus]
    u_corpus.representations = [_get_repr(i[0]) for i in u_instances]
    u_corpus._features_vectorizer = vect
    u_corpus.save_to_file(u_out_filename)
Example #10
def get_segment_labels(segs, inkml_obj, detector):
    """
    Get classifications for segments.
    """
    labels = []
    for seg in segs:
        tracez = []
        for tr in seg:
            tracez.append(inkml_obj.get_trace(tr))
        features = fe.get_features(tracez)
        label, prob = detector.score_for_trace([features])
        if label == ',':
            label = 'COMMA'
        labels.append(label)
    return labels
Example #11
def predict_arcs(conll_dict, model, feature_dict):
    buff = [i for i in range(len(conll_dict['FORM']))[::-1]]
    stack, dgraph = [], []
    while (len(buff) > 0 or len(stack) > 1):
        config = (stack, buff, dgraph)
        features = get_features(config, conll_dict)
        binary_features = one_hot_encoding([features], feature_dict)
        choice = model.predict(binary_features)
        try:
            if choice == 'shift': shift(stack, buff, stack)
            elif choice == 'left_arc': left_arc(stack, buff, dgraph)
            elif choice == 'right_arc': right_arc(stack, buff, dgraph)
            else: return None
        except IndexError:
            break
    return dgraph
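The parser above relies on shift, left_arc, and right_arc helpers that are not shown. A hedged sketch of what arc-standard transitions of this kind typically do; the arc orientation and (head, dependent) tuple layout are assumptions, not taken from the original project:

def shift(stack, buff, dgraph):
    # move the next buffer item (kept at the end of the reversed list) onto the stack
    stack.append(buff.pop())

def left_arc(stack, buff, dgraph):
    # attach the second stack item to the top one as (head, dependent), then remove the dependent
    dgraph.append((stack[-1], stack[-2]))
    del stack[-2]

def right_arc(stack, buff, dgraph):
    # attach the top stack item to the second one as (head, dependent), then remove the dependent
    dgraph.append((stack[-2], stack[-1]))
    stack.pop()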
Example #12
def get_feature_vectors_for_training(data):
    projective_tree_count = 0
    projective_non_parsable = []
    X = []
    y = []
    for id, sent_dict in data.items():
        if len(sent_dict['FORM']) == 1:
            # Example: train file, line 97384. Text: '************************'
            continue
        gold_arcs = get_gold_arcs(sent_dict['HEAD'])
        buff = [i for i in range(len(sent_dict['FORM']))[::-1]]
        projective = is_projective(gold_arcs, len(sent_dict['FORM']))

        if not projective:
            continue
        try:
            dgraph, configurations = make_transitions(buff, oracle_std, gold_arcs)
        except IndexError:
            projective_non_parsable.append(sent_dict)
            continue

        for config in configurations:
            X.append(get_features(config[:2],sent_dict))
            y.append(config[2])
        # Root missing.
        if set(gold_arcs)-set(dgraph):
            print("Missing arcs",set(gold_arcs)-set(dgraph))
        projective_tree_count+=1

    feature_values = set([feature for row in X for feature in row])
    feature_dict = {feature: i for i, feature in enumerate(feature_values)}
    with open('feature_dict.pkl', 'wb') as f:
        pickle.dump(feature_dict,f)

    X_ = one_hot_encoding(X,feature_dict)
    a = 0.0
    for i in range(len(X_)):
        b = float(sum(X_[i]))
        c = float(len(X[i]))
        a += b
    y = np.array(y)

    print("Number of valid projection trees : "+str(projective_tree_count))
    return X_,y
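Both parser examples call a one_hot_encoding(X, feature_dict) helper that is not shown. A minimal sketch of what it plausibly does, stated as an assumption: one binary indicator vector per configuration, with a 1 for every feature present in that row.

import numpy as np

def one_hot_encoding(rows, feature_dict):
    # assumed helper: binary indicator matrix over the known feature dictionary
    encoded = np.zeros((len(rows), len(feature_dict)), dtype=np.int8)
    for i, row in enumerate(rows):
        for feature in row:
            if feature in feature_dict:
                encoded[i, feature_dict[feature]] = 1
    return encoded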
Example #13
def emotion_classifier(audio_source_path, storage_name, action):
    get_observed_emotions_codes = get_emotion_code_from_description(emotion_labels)(observed_emotions)
    [extraction_active, train_active] = parse_action(action)

    print("Starting...")
    print("Feature extraction: {0}".format(extraction_active))
    print("Network train: {0}".format(train_active))

    execute = (pipe
               | get_features(
                mfcc_required=True,
                chroma_required=True,
                mel_required=True,
                storage_name=storage_name,
                active=extraction_active)
               | partial(filter, filter_dataset(get_observed_emotions_codes))
               | list
               | train_network())

    execute(audio_source_path)
Example #14
def avg(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)

    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(
        train_data, cv_data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg,
     weekday_avg) = feature_extraction.get_avg_maps(train_data)

    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
        train_data, cv_data, chunk_avg, hour_avg_by_chunk,
        weekday_avg_by_chunk, hour_avg, weekday_avg)

    clfs = regression.linear_regression(x_train_all, x_cv_all, targets_train,
                                        targets_cv)
    clfs = regression.random_forest(x_train_all, x_cv_all, targets_train,
                                    targets_cv)

    print('Filling submission file...')
    sub_data = utilities.read_file(submission_file, True)
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        weekday = ''
        all_features = feature_extraction.get_features(chunk_id, weekday, hour,
                                                       chunk_avg,
                                                       hour_avg_by_chunk,
                                                       weekday_avg_by_chunk,
                                                       hour_avg, weekday_avg)

        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]

    utilities.write_file(output_file, sub_data)
Example #16
def get_ft(dataset, id):
    print("Extracting features for: %s" % id)
    img = dataset[id].combined((Color.blue, Color.yellow, Color.red))
    return get_features(img, method=Feature.dct)
Example #17
#Hyperparameter variables go here
k = 5
k_lim = 20

#Dummy data set until we get the feature fully solved
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pa.read_csv(url, names=names)
#print(dataset.head())
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Real data for X and y
y = fe.read_instruments()
X = [fe.get_features(filename) for filename in fe.get_wav_files()]
print("X length:")
print(len(X))
print("and Y length:")
print(len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#Input data plotting
#fig = plt.figure(figsize=(6,6))
#plt.scatter(X_train, y_train, color='c', label='train')
#plt.scatter(X_test, y_test, color='m', label='test')
#plt.xlabel('x')
Example #18
def rbf(path_train,
        path_test,
        pca_comp=200,
        rbf_gamma=0.0003,
        rbf_C=10,
        X=None,
        X_test=None,
        y=None,
        y_test=None):
    '''
    This function trains and tests an svm with rbf kernel according to the data files
    specified in path_train and path_test
    :param path_train: str: path to folder with train files
    :param path_test: str: path to folder with test files
    :param pca_comp: int: number of components that pca will reduce the feature dimensions to
    :param rbf_gamma: float: gamma parameter for rbf kernel
    :param rbf_C: float: C parameter for the rbf kernel
    :param X: ndarray (N,D): feature matrix to train rbf kernel with; default is None
    :param X_test: ndarray (number of test samples, D): test feature matrix to test rbf kernel; default is None
    :param y: ndarray (N,): labels for training samples; default is None
    :param y_test: ndarray (number of test samples, ): labels for test samples; default is None
    '''

    print("\n__________RBF__________")
    pca = PCA(pca_comp)
    rbf = SVC(gamma=rbf_gamma, C=rbf_C, kernel='rbf')

    print("Extracting features for training...")

    if X is None:
        # extract mfcc features for training
        X = get_features(path_train)

    # normalize the features
    x_train_mean, x_train_std = train_normalize(X)
    X = (X - x_train_mean) / x_train_std
    #reduce to pca_comp dimensions using pca
    X = pca.fit_transform(X)

    if y is None:
        y = get_labels(path_train)

    print("Training with RBF...")

    # train an SVM with an RBF kernel
    rbf.fit(X, y)
    y_pred_train = rbf.predict(X)

    print("Extracting features for testing...")

    if X_test is None:
        X_test = get_features(path_test)

    # apply same normalization and pca dimensionality reduction to test feature matrix
    X_test = (X_test - (x_train_mean)) / (x_train_std)
    X_test = pca.transform(X_test)

    print("Testing with RBF...")

    if y_test is None:
        y_test = get_labels(path_test)
    y_pred = rbf.predict(X_test)

    print("\nTrain accuracy: ", np.mean(y_pred_train == y) * 100, "%", sep='')
    print("Test accuracy: ", np.mean(y_pred == y_test) * 100, "%", sep='')
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
Example #19
def linear_svm(path_train,
               path_test,
               svm_alpha=0.0001,
               lr=1e-4,
               num_iter=150,
               X=None,
               X_test=None,
               y=None,
               y_test=None):
    '''
    This function trains and tests a linear svm according to the data files
    specified in path_train and path_test
    :param path_train: str: path to folder with train files
    :param path_test: str: path to folder with test files
    :param svm_alpha: float: regularization parameter for the linear svm model
    :param lr: float: learning rate parameter for the svm model
    :param num_iter: int: maximum number of iterations for the linear svm model
    :param X: ndarray (N,D): feature matrix to train the linear svm with; default is None
    :param X_test: ndarray (number of test samples, D): test feature matrix to test the linear svm; default is None
    :param y: ndarray (N,): labels for training samples; default is None
    :param y_test: ndarray (number of test samples, ): labels for test samples; default is None
    '''

    print("\n__________Linear SVM__________")
    num_classes = 6
    svm = sk.linear_model.SGDClassifier(loss='hinge',
                                        penalty='l2',
                                        alpha=svm_alpha,
                                        learning_rate='constant',
                                        eta0=lr,
                                        tol=1e-5,
                                        max_iter=num_iter,
                                        early_stopping=True)

    print("Extracting features for training...")

    if X is None:
        # extract mfcc features for training
        X = get_features(path_train)

    # normalize the features
    x_train_mean, x_train_std = train_normalize(X)
    X = (X - x_train_mean) / x_train_std

    if y is None:
        y = get_labels(path_train)

    print("Training with Linear SVM...")

    # initialize W and b to small random weights; fit() takes them as coef_init and intercept_init
    W = np.random.uniform(-0.001, 0.001, (num_classes, 299 * 13))
    b = np.random.uniform(-0.001, 0.001, (num_classes))
    # we train using a linear svm
    svm.fit(X, y, W, b)
    y_pred_train = svm.predict(X)

    print("Extracting features for testing...")

    if X_test is None:
        X_test = get_features(path_test)

    # apply same normalization to test feature matrix
    X_test = (X_test - (x_train_mean)) / (x_train_std)

    print("Testing with Linear SVM...")

    if y_test is None:
        y_test = get_labels(path_test)
    y_pred = svm.predict(X_test)

    print("\nTrain accuracy: ", np.mean(y_pred_train == y) * 100, "%", sep='')
    print("Test accuracy: ", np.mean(y_pred == y_test) * 100, "%", sep='')
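Both SVM examples call a train_normalize helper that is not shown. A minimal sketch of what it likely computes, as an assumption: per-feature mean and standard deviation over the training set, so the same scaling can be reapplied to the test set (the epsilon guard is an addition to avoid division by zero):

import numpy as np

def train_normalize(X):
    # assumed helper: statistics computed on the training data only
    x_mean = X.mean(axis=0)
    x_std = X.std(axis=0) + 1e-8
    return x_mean, x_std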
Example #20
import os

import numpy as np
import keras
from tqdm import tqdm as tqdm
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD

import feature_extraction

if __name__ == '__main__':
    img_root = r'G:\Workspace\DS&Alg-Project1-Release\data\image'
    images, features = feature_extraction.get_features(img_root)
    images_categories = list({img.split('_')[0] for img in images})
    category_map = {
        images_categories[i]: i
        for i in range(len(images_categories))
    }

    x_train = features
    labels = np.array(
        list(map(lambda x: category_map[x.split('_')[0]], images)))
    y_train = keras.utils.to_categorical(labels.reshape((x_train.shape[0], 1)),
                                         num_classes=len(images_categories))
    model = Sequential()

    #! Length hard coded.
    model.add(Dense(512, activation='relu', input_dim=7 * 7 * 512))
    model.add(Dropout(0.5))
Example #21
def parallel(i):
    features = get_features(*i)
    features.pop("label")
    return pd.DataFrame(features, index=[0])
Example #22
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.utils import shuffle  # assumed source of shuffle() used below
import numpy as np  # needed for np.load / np.array below
import pickle
import feature_extraction

pos_data = np.load('sarcasm_array.npy')
neg_data = np.load('non_sarcasm_array.npy')

#print 'Number of  sarcastic tweets :', len(pos_data)
#print 'Number of  non-sarcastic tweets :', len(neg_data)

cls_set = ['Non-Sarcastic', 'Sarcastic']
features = []

for tweet in pos_data:
    features.append((feature_extraction.get_features(tweet), cls_set[1]))

for tweet in neg_data:
    features.append((feature_extraction.get_features(tweet), cls_set[0]))

features = np.array(features)
targets = (features[0::, 1] == 'Sarcastic').astype(int)

vec = DictVectorizer()
features_vec = vec.fit_transform(features[0::, 0])
pickle.dump(vec, open("features_dict.pkl", 'wb'))

order = shuffle(np.arange(len(features)))  # arange keeps this valid under Python 3
targets = targets[order]
features_vec = features_vec[order, 0::]
Example #23
def train_and_test(data_dir=DATA_DIR,
                   results_dir=RESULTS_DIR,
                   feature_types=FEATURE_TYPES):
    p = Path(data_dir)
    y_train_ = pd.read_csv(os.path.join(data_dir, 'y_train.csv'),
                           index_col=0,
                           header=None,
                           squeeze=True)
    y_train_ = y_train_.map(LANGUAGES)
    y_test_ = pd.read_csv(os.path.join(data_dir, 'y_test.csv'),
                          index_col=0,
                          header=None,
                          squeeze=True)
    y_test_ = y_test_.map(LANGUAGES)
    results = _recursive_defaultdict()

    for dir_ in [d for d in p.iterdir() if d.is_dir()]:

        # load the relevant preprocessed data
        print(f"loading X_train for {dir_.name}")
        X_train = pd.read_csv(os.path.join(dir_, 'X_train.csv'), index_col=0)
        print(f"indexing y_train for {dir_.name}")
        y_train = y_train_.loc[X_train.index]
        results[dir_.name]['y_train'] = y_train
        print(f"loading X_test for {dir_.name}")
        X_test = pd.read_csv(os.path.join(dir_, 'X_test.csv'), index_col=0)
        print(f"indexing y_test for {dir_.name}")
        y_test = y_test_.loc[X_test.index]
        results[dir_.name]['y_test'] = y_test

        for feature_type in feature_types:
            # select features to use in training
            features = get_features(X_train, type_=feature_type)
            print(
                f"using features `{feature_type}` with preprocessing `{dir_.name}`"
            )
            train_data = lgb.Dataset(features, label=y_train)

            # tune hyper parameters and save them
            best_score, best_hyperparams = tune_hyperparams(train_data)
            results[dir_.name][feature_type]['cv_error'] = best_score
            results[dir_.name][feature_type]['params'] = best_hyperparams

            # train
            print(
                f"training using features `{feature_type}` with preprocessing `{dir_.name}`"
            )
            print(f"training params:\n {best_hyperparams}")
            booster = lgb.train(best_hyperparams, train_data)

            # calculate training metrics
            train_predictions = booster.predict(features).argmax(axis=1)
            train_predictions = pd.Series(train_predictions,
                                          index=X_train.index)
            results[dir_.name][feature_type]['train_pred'] = train_predictions
            results[dir_.name][feature_type]['train_error'] = np.mean(
                train_predictions != y_train)
            print(
                f"training error is {results[dir_.name][feature_type]['train_error']}"
            )

            # calculate test metrics
            test_features = get_features(X_test, type_=feature_type)
            test_predictions = booster.predict(test_features).argmax(axis=1)
            test_predictions = pd.Series(test_predictions, index=X_test.index)
            results[dir_.name][feature_type]['test_pred'] = test_predictions
            results[dir_.name][feature_type]['test_error'] = np.mean(
                test_predictions != y_test)
            print(
                f"test error is {results[dir_.name][feature_type]['test_error']}"
            )
            # save the model
            models_dir = os.path.join(dir_, MODELS_DIR)
            os.makedirs(models_dir, exist_ok=True)
            booster.save_model(
                os.path.join(models_dir, f"{feature_type}_model.txt"))

    # save the results
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, 'results.pkl'), 'wb') as handle:
        pickle.dump(results, handle)

    return results
Example #24
        print(original_data.question1[index])
        counter = counter + 1
        if counter == count:
            return


if __name__ == '__main__':
    best_iteration = 111
    question = "What are the best ways to loose weight?"
    prepare_traindata = True
    #prepare_traindata = False

    start = clock()
    if prepare_traindata:
        #prepare traindata
        #feature_extraction.extract_features(False, True)
        feature_selection.select_features_at(best_iteration)
        model_training.train_and_save_model()

        generate_tmp_data(question)

        train_features = feature_extraction.get_features(data_paths.tmp)
        train_features.to_csv(data_paths.tmp_features, index=False)

        prediction.predict_and_write_data(data_paths.tmp,
                                          data_paths.tmp_features,
                                          data_paths.tmp_submission)

    get_top_questions(30)

    print('Overall duration: ', round(clock() - start, 0), 'seconds')
Example #25
def predictions(url):
    ff = fe.get_features(url)
    prob = RF_clf.predict_proba(ff)[0][1] * 100

    return prob, ff
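A minimal usage sketch; the URL is a placeholder and RF_clf is assumed to be a fitted scikit-learn classifier loaded elsewhere in the module:

prob, feature_vector = predictions('http://example.com/some-page')
print('Probability of the positive class: {:.1f}%'.format(prob))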
Example #26
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Importing data cleaning and feature extraction functions
from data_cleaning import clean_words
from feature_extraction import get_features

# Datasets path
true_dataset_path = 'dataset/True.csv'
fake_dataset_path = 'dataset/Fake.csv'

# Data pre-processing
df = get_features(true_dataset_path, fake_dataset_path)

# Cleaning text Data
df['total'] = df['total'].apply(clean_words)

# Defining x and y as feature and label respectively
x = df['total']
y = df['label']

# Train-Test splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)


# Logistic Regression
LR = Pipeline([('TFIDF_Vectorizer', TfidfVectorizer()),
               ('Logistic_Regression', LogisticRegression(n_jobs=-1))])
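The script is cut off above; a hedged continuation sketch showing how such a pipeline is typically fitted and scored (not part of the original code):

LR.fit(x_train, y_train)
print('Logistic Regression accuracy:', LR.score(x_test, y_test))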