def train_original():
    fake, real = data_processing.load_data()
    data, keywords = data_processing.process_data(fake, real)
    training_set = data_processing.Headlines(data[0])
    validation_set = data_processing.Headlines(data[1])
    testing_set = data_processing.Headlines(data[2])
    print('Data Loaded')
    model = classifiers.ConvnetClassifier(len(keywords),
                                          data[0][0][0].shape[1]).cuda()
    loss_fn = torch.nn.CrossEntropyLoss().cuda()
    training_loss, validation_loss = train.train_classifier(model,
                                                            loss_fn,
                                                            training_set,
                                                            validation_set,
                                                            patience=3)
    plt.plot(training_loss)
    plt.plot(validation_loss)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(('Training Set', 'Validation Set'))
    plt.savefig('error_orig.png')
    plt.show()
    torch.save(model.state_dict(), 'model_orig.pkl')
    model.eval()
    print('Achieved {:%} accuracy on the training set.'.format(
        train.get_accuracy(model, training_set)))
    print('Achieved {:%} accuracy on the validation set.'.format(
        train.get_accuracy(model, validation_set)))
    print('Achieved {:%} accuracy on the testing set.'.format(
        train.get_accuracy(model, testing_set)))
Example #2
def main():
    torch.manual_seed(24)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(device)
    data_dir = './clean_data/full'
    model_name = '3LayerModel'
    word_length = 8

    corpus, labels, vector_length, _, label_dict = process_data(
        data_dir, word_length=word_length)

    num_languages = len(label_dict.keys())

    for i in range(num_languages):
        count = len([l for l in labels if l == i])
        print(f'{label_dict[i]} word count = {count}')
    print(f'Vector length = {vector_length}')

    model = LanguageNet(vector_length, word_length, num_languages)
    model = model.to(device)
    # model = ConvLanguageNet(vector_length, word_length, num_languages)
    model.train()

    training_loop(model, corpus, labels, model_name='Linear', device=device)

    model = ConvLanguageNet(vector_length, word_length, num_languages)
    model = model.to(device)
    model.train()

    training_loop(model,
                  corpus,
                  labels,
                  model_name='Convolution',
                  device=device)
Example #3
def main(filepath, modelpath):

    """ main function 

    Args:
        filepath ([txt]): filepath containing our dataset
        modelpath ([h5]): modelpath containing our model
    """
    df, eng_deu_lines = read_file(filepath)
    dfp = process_data(df)
    eng_deu = data_array(dfp)
    eng_tok = token(eng_deu[:, 0])
    eng_len_vocab = len(eng_tok.word_index) + 1
    deu_tok = token((eng_deu[:, 1]))
    deu_len_vocab = len(deu_tok.word_index) + 1
    data_train, data_test = train_test_split(eng_deu, test_size=0.2,  
                                             random_state=1)
    X_train = encoding(data_train[:, 1], 8, deu_tok)
    y_train = encoding(data_train[:, 0], 8, eng_tok)
    X_test = encoding(data_test[:, 1], 8, deu_tok)
    y_test = encoding(data_test[:, 0], 8, eng_tok)
    model = load_model(modelpath)
    preds = model.predict_classes(X_test.reshape((X_test.shape[0], 
                                                 X_test.shape[1])))
    df_preds = prediction(preds, eng_tok)
    return df_preds
Example #4
def main(trainPath,
         testPath,
         submissionPath,
         processData=True,
         X_train=None,
         X_test=None,
         y_train=None,
         y_test=None,
         X_submission_df=None,
         X_submission=None):
    max_score = 0
    iter = 0
    if processData:
        X_train, X_test, y_train, y_test, X_submission_df, X_submission = process_data(
            trainPath, testPath)
    for description, model in models.items():
        print(description)
        print(model.fitModel(X_train, y_train))

        score = model.getTestScore(X_test, y_test)

        if score > max_score:
            max_score = score

        print(score)
        model.evaluateModel(X_submission, X_submission_df, submissionPath,
                            iter)
        iter += 1
    return max_score
def largest_activations():
    fake, real = data_processing.load_data()
    _, keywords = data_processing.process_data(fake, real)
    model = classifiers.ConvnetClassifier(len(keywords), 40)
    model.load_state_dict(torch.load('model_orig.pkl'))
    weights = model.classifier[1].weight.data.numpy()

    print("Real sequences")
    most_real = np.argsort(weights[0])[-10:]
    for most in most_real:
        if most < 100:
            conv = model.features3[0].weight.data.numpy()[most]
        elif most < 200:
            conv = model.features4[0].weight.data.numpy()[most - 100]
        else:
            conv = model.features5[0].weight.data.numpy()[most - 200]
        print(*keywords[np.argmax(conv, 0)])

    print("Fake sequences")
    most_fake = np.argsort(weights[1])[-10:]
    for most in most_fake:
        if most < 100:
            conv = model.features3[0].weight.data.numpy()[most]
        elif most < 200:
            conv = model.features4[0].weight.data.numpy()[most - 100]
        else:
            conv = model.features5[0].weight.data.numpy()[most - 200]
        print(*keywords[np.argmax(conv, 0)])
Example #6
def cross_validation(y, x, k_indices, k, lambda_, degree):

    # Split the fold indices: the k-th fold is the test set, the remaining folds form the training set
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)

    y_te = y[te_indice]
    y_tr = y[tr_indice]
    tx_te = x[te_indice]
    tx_tr = x[tr_indice]

    # Preprocessing data: cleaning, standardizing and adding a constant column
    tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

    # Feature augmentation through polynomials
    tx_tr = build_poly(tx_tr, degree)
    tx_te = build_poly(tx_te, degree)

    # Printing degree and lambda tested
    print("Test: d = ", degree, "; l = ", lambda_)

    # Training with ridge regression
    w, loss = ridge_regression(y_tr, tx_tr, lambda_)

    # Computing prediction vector
    y_pred = predict_labels(w, tx_te)

    # Computing accuracy on test set
    accuracy = compute_accuracy(y_te, y_pred)

    # Log information
    print("Accuracy = ", accuracy, "; loss = ", loss, "\n")

    return loss, accuracy
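
# Note (assumption, not from the original snippet): cross_validation expects
# k_indices to be a (k_fold, fold_size) array of shuffled row indices. A
# typical helper to build it looks like this:
#
#     def build_k_indices(y, k_fold, seed):
#         num_row = y.shape[0]
#         interval = int(num_row / k_fold)
#         np.random.seed(seed)
#         indices = np.random.permutation(num_row)
#         return np.array([indices[k * interval:(k + 1) * interval]
#                          for k in range(k_fold)])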
Example #7
def main():
    data = process_data('test.txt')
    bar_plot_LEO(data)
    total_line_and_scatter_plot_LEO(data)
    total_bar_stacked_LEO(data)
    orbit_plot()
    compare_years_by_orbit(data)
    compare_by_alt(data)
Example #8
def main():

    df = dp.process_data('test.txt')

    df = probability_calc(df)

    df.to_csv('probability.csv')

    print('Probability Finished!')
Example #9
def main():
    df = pd.read_csv("GamingStudy_data.csv")
    data = data_processing.process_data(df)
    avg_GAD_over_20(data)
    avg_GAD_under_20(data)
    avg_hours_work(data)
    hours_game_age(data)
    narcissism_gaming_hours(data)
    narcissism_over_20_hours(data)
    narcissism_mental_health(data, 'GAD_T')
    narcissism_mental_health(data, 'SPIN_T')
    narcissism_mental_health(data, 'SWL_T')
Example #10
def predict(model, data):
    #df = pd.read_json(data)
    df = pd.DataFrame([data])
    X_top = dp.process_data(df)
    # with open('data/random_forest.pkl') as f:
    #     rf_top = pickle.load(f)

    # Use the unpickled model to predict on the new data
    y_pred = model.predict_proba(X_top)

    # Adds new column in df with predicted probability of fraud
    df['fraud_prob'] = y_pred[:,1]
    return df
Example #11
def main(pd, gs):
    if pd:
        process_data('train.csv', 'test.csv')
    y_train_jets = []
    tx_train_jets = []
    ids_train_jets = []
    y_test_jets = []
    tx_test_jets = []
    ids_test_jets = []
    load_data_sets(y_train_jets, tx_train_jets, ids_train_jets, y_test_jets,
                   tx_test_jets, ids_test_jets)
    degree_best_jets = [6, 6, 6, 6]
    lambda_best_jets = [6e-05, 0.0023, 4.6e-09, 5.7e-05]
    if gs:
        perform_grid_search_with_cross_validation(degree_best_jets,
                                                  lambda_best_jets,
                                                  y_train_jets, tx_train_jets)
    predictions = []
    ids_predicted = []
    learn(predictions, ids_predicted, y_train_jets, tx_train_jets,
          tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets)
    combine_and_create_submission(predictions, ids_predicted,
                                  'submit_E_M_D_best')
Example #12
def create_classification_models(loop_features, loop_targets, feature_names, main_params):
    loop_features, loop_targets, new_feature_names, new_feature_indices = process_data(loop_features, loop_targets,
                                                                                       feature_names,
                                                                                       main_params["data_processing"],
                                                                                       True)

    if np.shape(np.array(loop_features))[1] == 0:
        print("Canceling create_classification_models due to lack of features after data processing")
        return

    analyze_data(loop_features, loop_targets, new_feature_names, main_params["data_analysis"], False, True)

    # test_ratio = 1 - main_params["sampling_params"]["train_ratio"]
    # x_train, x_test, y_train, y_test = train_test_split(loop_features, loop_targets, test_size=test_ratio)

    run_sk_classification(loop_features, loop_targets, main_params, new_feature_indices, new_feature_names)
Example #13
    def test_shapes(self):
        proc = process_data()

        under_shape = proc.x_train_under.shape
        under_target_shape = (437, 3197)

        shrink_shape = proc.x_train_shrink.shape
        shrink_target_shape = (800, 3197)

        over_shape = proc.x_train_over.shape
        over_target_shape = (10100, 3197)

        # test if shape is correct
        self.assertEqual(under_shape, under_target_shape)
        self.assertEqual(shrink_shape, shrink_target_shape)
        self.assertEqual(over_shape, over_target_shape)
Example #14
def create_regression_models(loop_features, loop_targets, feature_names, main_params):
    loop_features, loop_targets, new_feature_names, new_feature_indices = process_data(loop_features, loop_targets,
                                                                                       feature_names,
                                                                                       main_params["data_processing"],
                                                                                       False)

    if np.shape(np.array(loop_features))[1] == 0:
        print("Canceling create_regression_models due to lack of features after data processing")
        return

    analyze_data(loop_features, loop_targets, new_feature_names, main_params["data_analysis"], False, False)

    x_train, x_test, y_train, y_test = utils.stratified_regressions_sampling(loop_features, loop_targets,
                                                                             main_params["sampling_params"])

    if main_params["data_processing"]["regression_smote"]:
        x_train, y_train = smote_oversampling_regression(x_train, y_train, new_feature_names)

    run_sk_regression(x_train, x_test, y_train, y_test, main_params, new_feature_indices, new_feature_names)
def mean_absolute_percentage_error(y_true, y_pred):
    import numpy as np
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
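
# Worked example (illustrative, not part of the original snippet): with
# y_true = [100, 200] and y_pred = [110, 180], the absolute percentage errors
# are 10/100 = 0.10 and 20/200 = 0.10, so the MAPE is 0.10 * 100 = 10%.
#
#     print(mean_absolute_percentage_error([100, 200], [110, 180]))  # 10.0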


ks = [300, 300, 300]  #every other
#%%

MAPES = []
for k_prof in ks:
    for k_hash in ks:

        X, y = process_data('Business Analytics/training_set.csv',
                            k_prof=k_prof,
                            k_hash=k_hash,
                            training=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)
        model = XGBRegressor(colsample_bytree=0.5,
                             gamma=0.05,
                             max_depth=4,
                             min_child_weight=4,
                             n_estimators=1000,
                             subsample=0.6)

        model.fit(X_train, y_train)
        print("MAPE Train Score ")
        print(mean_absolute_percentage_error(y_train, model.predict(X_train)))
Example #16
    # build a dictionary whose keys are the suggestions and whose values are each suggestion's probability
    best_words = {s: probs.get(s, 0) for s in suggestions}

    # keep the n suggestions with the highest probabilities
    n_best = Counter(best_words).most_common(n)

    #n_best = [[s, probs[s]] for s in list(reversed(suggestions))]

    if verbose: print("entered word =", word, "\nsuggestions =", suggestions)

    return n_best


if __name__ == "__main__":
    # load corpus data
    word_l = data_processing.process_data("../input/shakespeare.txt")

    # create set of corpus words
    vocab = set(word_l)

    # get word frequencies
    word_count_dict = data_processing.get_count(word_l)

    # get probability of word in the corpus
    probs = data_processing.get_prob(word_count_dict)

    my_word = 'dys'

    tmp_corrections = get_corrections(my_word, probs, vocab, 2, verbose=True)

    for i, word_prob in enumerate(tmp_corrections):
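        # Truncated in the original; the loop body presumably reports each
        # candidate correction and its probability, e.g.:
        #     print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")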
Example #17
    return input_list_train, input_list_val, input_list_test


def logloss(y, pred):
    N = len(y)
    score = 0
    for i in range(N):
        score += y[i] * log(pred[i]) + (1 - y[i]) * log(1 - pred[i])
    return -score / N
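
# Worked example (illustrative, not part of the original snippet): for
# y = [1, 0] and pred = [0.9, 0.1], each sample contributes log(0.9), so the
# result is -log(0.9) ~ 0.1054. Predictions of exactly 0 or 1 would make log()
# diverge, so pred values are assumed to lie strictly inside (0, 1).
#
#     print(round(logloss([1, 0], [0.9, 0.1]), 4))  # 0.1054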


## Network training ##

# Loading data
df_train = process_data()

print("First steps of the neural network... ")

cols = [c for c in df_train.columns if c not in ['is_churn', 'msno']]

X_train, X_test, y_train, y_test = train_test_split(df_train[cols],
                                                    df_train['is_churn'],
                                                    test_size=0.30,
                                                    random_state=242)

print(" X_train = ", X_train)
print("y_train = ", y_train)
print(" X_test = ", X_test)
print("y_test = ", y_test)
def main():
    df = process_data('test.txt')
    polynomial_fit_count(df)
    polynomial_fit_probability(df)
    get_orbit_tally(df, 'GEO')
from cnn import Net
import data_processing
from data_processing import process_data
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

train, test, val = process_data()
device = torch.device('cuda:0')
net = Net().to(device)

def train_model(net):

    X = torch.Tensor([i[0] for i in train]).view(-1, 98, 98).to(device)
    # Normalize data
    X = X / 255.0
    y = torch.Tensor([i[1] for i in train]).to(device)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    loss_function = nn.MSELoss()

    epochs = 2
    batch_size = 32

    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for i in range(0, len(X), batch_size):
            batch_X = X[i: i+batch_size].view(-1, 1, 98, 98).to(device)
            batch_y = y[i: i+batch_size].to(device)
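            # The original snippet is truncated here; a standard PyTorch update
            # step (assumed, not shown in the source) would typically follow:
            #     optimizer.zero_grad()
            #     outputs = net(batch_X)
            #     loss = loss_function(outputs, batch_y)
            #     loss.backward()
            #     optimizer.step()
            #     running_loss += loss.item()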
Example #20
#!/usr/bin/python

from FitnessCalc import FitnessCalc
from GeneticAlgorithm import GeneticAlgorithm
from data_processing import process_data

# load and process the training data
summary = process_data('../resources/train.csv', 500, True)

# create GA instance
ga = GeneticAlgorithm()

# load data into fitness calculator
fc = FitnessCalc(summary)

#create initial population
ga.create_initial_population()

for generations in xrange(1, 51):
    max_fitness = 0.0
    print 'generation ' + str(generations)
    print 'max fitness: ' + str(max_fitness)
    #assign fitness levels
    total_fitness = 0.0
    for i, indiv in enumerate(ga.population):
        fitness = fc.calculate_fitness(indiv['chromosome'])
        total_fitness += fitness
        indiv['fitness'] = fitness
        print str(round(fitness, 2)) + ' ' + indiv['chromosome']
        print 'avg fitness: ' + str(float(total_fitness) / (i + 1))
from implementations import ridge_regression
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from data_processing import process_data, build_poly

print("Loading data\n")

# Loading data from csv files
y_tr, tx_tr, ids_tr = load_csv_data("data/train.csv")
y_te, tx_te, ids_te = load_csv_data("data/test.csv")

# Hyper-parameters definitions
degree = 7
lambda_ = 0.00025

# Preprocessing data: cleaning, standardizing and adding a constant column
tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

# Feature augmentation through polynomials
tx_tr = build_poly(tx_tr, degree)
tx_te = build_poly(tx_te, degree)

# Training with ridge regression
print("Training the model\n")
weights, _ = ridge_regression(y_tr, tx_tr, lambda_)

# Computing prediction vector
y_pred = predict_labels(weights, tx_te)

# Creating file for submission
create_csv_submission(ids_te, y_pred, "prediction.csv")
        # Print progress
        print(f"Current iteration: {i+1}/{len(max_depth_array)}")

    # Plot
    plt.style.use('seaborn-whitegrid')
    plt.plot(max_depth_array, train_score, label="Train score")
    plt.plot(max_depth_array, valid_score, label="Validation score")
    plt.ylabel("Recall score", fontsize=14)
    plt.xlabel("Max depth", fontsize=14)
    plt.title("Overfitting decision tree on oversampled train and validation",
              fontsize=16)
    plt.legend()
    plt.savefig("../visuals/overfitting_dt_depth.pdf")
    plt.show()


if __name__ == "__main__":
    #getting training and testing data
    df = process_data(print_results=False)
    x_train_up = df.x_train_over  # Oversampled training data
    y_train_up = df.y_train_over  # Oversampled training target
    x_train_down = df.x_train_shrink  # Shrunk and oversampled training data
    y_train_down = df.y_train_shrink  # Shrunk and oversampled training target
    x_train = df.x_train  # Training data
    y_train = df.y_train  # Training targets
    x_test = df.x_test  # Test data
    y_test = df.y_test  # Test target

    tune_decision_tree()
Example #23
            'out/fly/acoustic-guitars.csv'))
    items.append(
        SearchItem(
            'https://www.fly-music.ro/21-chitare-chitari-chitara-bass-electrice-electrica',
            'out/fly/electric-bass.csv'))
    items.append(
        SearchItem('https://www.fly-music.ro/144-sintetizatoare-sintetizator',
                   'out/fly/organs.csv'))
    items.append(
        SearchItem(
            'https://www.fly-music.ro/80-chitare-chitari-chitara-electro-acustice--acustica-seturi',
            'out/fly/electro-acoustic-guitars.csv'))
    items.append(
        SearchItem('https://www.fly-music.ro/239-clape-midi-claviaturi-midi',
                   'out/fly/midi.csv'))

    crawler = FlyMusicCrawler(driver)
    for i in items:
        crawler.crawl_and_save(i.url, i.out_file)

    driver.quit()


if __name__ == '__main__':
    # crawl_mcmusic()
    # crawl_flymusic()
    # pack('out/mc')
    # pack('out/fly')

    process_data('out/merged/', 'mc.csv', 'fly.csv')
Example #24
    def setup_class(cls):
        """This method is run once for each class before any tests are run."""
        cls.X_train, cls.X_test, cls.y_train, cls.y_test, cls.X_submission_df, cls.X_submission = process_data(
            "/home/spolezhaev/train", "/home/spolezhaev/test")
Example #25
    train_df = train_df.sample(
        frac=FRAC_DATA, random_state=RANDOM_STATE).reset_index(drop=True)
    val_df = val_df.sample(frac=FRAC_DATA,
                           random_state=RANDOM_STATE).reset_index(drop=True)
    test_df = test_df.sample(frac=FRAC_DATA,
                             random_state=RANDOM_STATE).reset_index(drop=True)

    print(f"Using {int(FRAC_DATA*100)}% of the dataset.")

    NB_SPECIES = len(set(train_df['label']))  # Number of classes
    print("NB_SPECIES: ", NB_SPECIES)

    print("Processing Training Data...")
    trainloader = process_data(df=train_df,
                               batch_size=BATCH_SIZE,
                               sample_rate=SR,
                               audio_duration=AUDIO_DURATION,
                               random_state=RANDOM_STATE,
                               do_plot=False)

    print("Processing Validation Data...")
    validationloader = process_data(df=val_df,
                                    batch_size=BATCH_SIZE,
                                    sample_rate=SR,
                                    audio_duration=AUDIO_DURATION,
                                    random_state=RANDOM_STATE,
                                    do_plot=False)

    print("Processing Test Data...")
    testloader = process_data(df=test_df,
                              batch_size=1,
                              sample_rate=SR,
Example #26
    if (len(sys.argv) < 2) or (len(sys.argv) > 3):
        print("Usage:")
        print("\tmain.py data_path [pretrained_model_path]")
        sys.exit()

    DATA_PATH = sys.argv[1]
    MODEL_PATH = None
    if len(sys.argv) == 3:
        MODEL_PATH = sys.argv[2]

    data = pd.read_csv(DATA_PATH, sep='\t', header=None, names=['en', 'ru'])

    # Data preprocessing
    # Keep sentences shorter than 14 words, which removes only about 1% of the initial data
    max_sentence_length = 14
    data = process_data(data, max_sentence_length, SOS, EOS)

    tokenizer_en = tokenize_data(data.en, vocab_size=2**15)
    tokenizer_ru = tokenize_data(data.ru, vocab_size=2**15)

    encoder_max_length = max(
        [len(tokenizer_en.encode(sentence)) for sentence in data.en]) + 5
    decoder_max_length = max(
        [len(tokenizer_ru.encode(sentence)) for sentence in data.ru]) + 5

    X_train, X_test, y_train, y_test = train_test_split(np.array(data.en),
                                                        np.array(data.ru),
                                                        test_size=0.15,
                                                        random_state=15)

    # Model definition
Example #27
    update = [int(x) for x in update]
    freq = dict(Counter(update))
    update = sorted(list(set(update)))
    index = range(int(update[0]), int(update[-1]) + 1)
    freq_list = []
    for i in index:
        if i in freq:
            freq_list.append(freq[i])
        else:
            freq_list.append(0)
    print index, freq_list
    return index, freq_list


if __name__ == '__main__':
    X, Y = process_data('./hw1_15_train.dat')

    # Q15
    _, index_record, _ = naive_cyclic_PLA(X, Y)
    print 'question 15: updates: %d, index that results in max updates: %d' % (
        sum(index_record), index_record.argsort()[::-1][0])

    #Q16
    print 'question 16'
    update = []
    for i in range(2000):
        _, index_record, _ = naive_cyclic_PLA(X, Y, random_ord=True)
        total_update = sum(index_record)
        update.append(int(total_update))
    #update, freq = data_reorder(update)
    #histogram(update, freq, 'question16.png')
Example #28
    c_lst = np.logspace(-2, 2, 10)
    kernel_lst = ['linear', 'poly', 'sigmoid', 'rbf']
    gamma_lst = np.logspace(-3, 1, 10)

    for i in range(10):
        for j in range(10):
            print(c_lst[i])
            model = SVC(kernel='sigmoid',
                        gamma=gamma_lst[j],
                        C=c_lst[i],
                        probability=True)
            evaluate_model(model, x_train_down, x_test, y_train_down, y_test,
                           'baseline_CM_SVM',
                           'Baseline confusion matrix: SVM  ', 'svm_rp_name',
                           'svm_cm_name')


if __name__ == "__main__":
    #getting training and testing data
    df = process_data(print_results=False, plot=True)
    x_train_up = df.x_train_over  # Oversampled training data
    y_train_up = df.y_train_over  # Oversampled training target
    x_train_down = df.x_train_shrink  # Shrunk and oversampled training data
    y_train_down = df.y_train_shrink  # Shrunk and oversampled training target
    x_train = df.x_train  # Training data
    y_train = df.y_train  # Training targets
    x_test = df.x_test  # Test data
    y_test = df.y_test  # Test target

#    tune_SVM()
#seq2seq train
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

#load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = \
    data_utils.split_data(idx_descriptions, idx_headings)

#define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')

print(checkpoint_path)

#define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
Example #30
import pandas as pd
from data_processing import process_data, get_truth_cat
from plot import plot_fig
import math
import numpy as np
from baseline import FixedBaseline, ClinicalBaseline
from lin_ucb import Env, LinUCB, ThompsonSampler, LinOracle, SupervisedBandit

features, feature_df, dosage = process_data('./data/warfarin.csv')
true_cat = get_truth_cat(dosage)
env = Env(features, true_cat, dosage)
clinical_baseline = ClinicalBaseline()
fixed_baseline = FixedBaseline()
lin_ucb = LinUCB(3, len(features[0]), 0.1)
lin_thompson = ThompsonSampler(3, len(features[0]), 0.01)
lin_oracle = LinOracle(3, features, true_cat)
supervised_bandit = SupervisedBandit(len(features[0]))

algo = {}
algo["clinical_baseline"] = clinical_baseline
algo["fixed_baseline"] = fixed_baseline
algo["lin_ucb"] = lin_ucb
algo["lin_thompson"] = lin_thompson
algo["lin_oracle"] = lin_oracle
algo["supervised_bandit"] = supervised_bandit
reward_list = {}
regret_list = {}
for i in algo:
    reward_list[i] = []
    if i != "lin_oracle":
        regret_list[i] = []
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

#load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = data_utils.split_data(idx_descriptions, idx_headings)

#define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')

print (checkpoint_path)

#define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)
Example #32
import numpy as np
from data_processing import process_data, generate_data, histogram
from Decision_stump import one_dimension_decision_stump, multi_dimension_decision_stump, check_accuracy,Out_of_sample_error


if __name__ == '__main__':
    #Q17, Q18
    E_in_list = []
    E_out_list = []
    for i in range(5000):
        X, Y = generate_data(10, 5)
        score, s, theta = one_dimension_decision_stump(X, Y)
        E_in_list.append((10-float(score))/10)
        E_out_list.append(Out_of_sample_error(s, theta))
    histogram(E_in_list, 'question 17', 'in sample error', 'frequency')
    print "Question 17: average in sample error: %f" % (sum(E_in_list)/5000)
    histogram(E_out_list, 'question 18', 'out of sample error', 'frequency')
    print "Question 18: average out of sample error: %f" % (sum(E_out_list)/5000)
    
    #Q19
    X_train, Y_train = process_data('./hw2_train.dat')
    X_test, Y_test = process_data('./hw2_test.dat')
    best_record, s, theta, index = multi_dimension_decision_stump(X_train, Y_train)
    print "Qustion 19: index: %d, h = %d * sign(x - %f), in sample error: %f" % (index, s, theta, (len(Y_train)-float(best_record))/len(Y_train))
    X_test_trans = np.transpose(X_test)
    accuracy = check_accuracy(s, theta, X_test_trans[index], Y_test)
    print "Qustion 20: out of sample error: %f" % (1 - accuracy)