Example #1
def test_load_data(self):
    train_df, valid_df = load_train_data(self.audio_path,
                                         self.validation_list_path)
    self.assertEqual(train_df.shape[0], 57929)
    self.assertEqual(valid_df.shape[0], 6798)
    df = train_df.groupby('label').apply(sampling(2000))
    print(df.shape)
def _make_submission(params):
    test_paths = glob(params['test_path'])
    if params['sample']:
        print("Get small sample")
        test_paths = test_paths[:params['sample_size']]
    model = load_model(params['model_path'])
    train_data, validate_data = load_train_data(params['audio_path'], params['validation_list_path'])
    assert len(train_data) != 0
    assert len(validate_data) != 0

    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_data, wav_reader)
    sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )

    tests = test_generator(test_paths, params['batch_size_pred'], sound_chain)
    print("PREDICTING")
    predictions = model.predict_generator(tests, int(np.ceil(len(test_paths) / params['batch_size_pred'])))
    classes = np.argmax(predictions, axis=1)
    submission = {}
    print("SAVING")
    for i in range(len(test_paths)):
        fname, label = os.path.basename(test_paths[i]), id2name[classes[i]]
        submission[fname] = label

    return submission
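A hedged follow-up sketch (not part of the original script): the {filename: label} dict returned by _make_submission can be written out as a two-column CSV. The save_submission helper and the 'fname'/'label' column names are assumptions.
# Sketch only: persist the submission dict returned above.
import pandas as pd

def save_submission(submission, out_path):
    df = pd.DataFrame(list(submission.items()), columns=['fname', 'label'])
    df.to_csv(out_path, index=False)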
def main_confusion_matrix(params):
    model = load_model(params['model_path'])
    train_df, valid_df = load_train_data(params['audio_path'], params['validation_list_path'])
    assert len(train_df) != 0
    assert len(valid_df) != 0

    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_df, wav_reader)
    sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )

    validate_gen = valid_generator(valid_df, params['batch_size'], sound_chain, with_y=False)
    # steps must match the batch size the generator was built with
    predictions = model.predict_generator(validate_gen, int(np.ceil(valid_df.shape[0] / params['batch_size'])))
    classes = [id2name[i] for i in np.argmax(predictions, axis=1)]
    y_true = valid_df['label'].values
    labels = np.unique(valid_df['label'].values)
    cm = confusion_matrix(y_true, classes, labels=labels)
    df = pd.DataFrame(cm, columns=labels, index=labels)
    df.to_csv(os.path.join(params['output_path'], 'confusion.csv'), index_label='index')
    print(df)
    return df
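As a hedged aside (not in the original), per-class recall falls straight out of the confusion DataFrame returned above, since its rows are the true labels.
# Sketch only: row-normalize the confusion matrix to get per-class recall.
import pandas as pd

def per_class_recall(cm_df):
    return pd.Series(cm_df.values.diagonal() / cm_df.values.sum(axis=1),
                     index=cm_df.index)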
def train_model(training_iterations, batch_size, train_data_file):
    accuracy_list, entropy_list = [], []
    x_data, y_data = load_train_data(train_data_file, train_with_only_known_age_data)
    #print("Length of input data is: ", len(x_data))
    start_time = time.time()
    saver = tf.train.Saver()
    for i in range(training_iterations):
        x_batch, y_batch = get_batch(x_data, y_data, batch_size)
        training_data = {x: x_batch, y_: y_batch}
        accrcy, entropy = sess.run([accuracy, cross_entropy], feed_dict=training_data)

        # Backpropagation
        sess.run(train_step, feed_dict=training_data)
        accuracy_list.append(accrcy)
        entropy_list.append(entropy)

        # Save a checkpoint every `checkpoint_every` steps so the trained model can be restored later
        checkpoint_dir = "checkpoints"
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        if i % checkpoint_every == 0:
            saver.save(sess, os.path.join(checkpoint_dir, "trained_model"), global_step=i)

        # Print the training performance periodically
        if i % 100 == 0:
            print("Accuracy after %s training steps is: %s" % (i, accrcy))
    print("")
    print("Training process is done in time: ", time.time() - start_time, "seconds.")
    return accuracy_list, entropy_list
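An assumed usage sketch for the curves this function returns; the hyperparameter values and file path are placeholders, not taken from the original.
# Sketch only: plot the accuracy and cross-entropy histories.
import matplotlib.pyplot as plt

acc_hist, entropy_hist = train_model(training_iterations=1000, batch_size=64,
                                     train_data_file='train.csv')
plt.plot(acc_hist, label='accuracy')
plt.plot(entropy_hist, label='cross-entropy')
plt.xlabel('training step')
plt.legend()
plt.show()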
Example #5
def main():
    FNAME = "model_train_lgbm"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Load raw data
    train_raw = dl.load_train_data()
    # Load generated features
    train_features = load_combined_features(logger)

    train_column_names = list(train_features.columns.values)
    logger.info("Training set column names: " + str(train_column_names))

    # train_features = pd.concat([train_features, train_raw[config.NUMBER_FEATURES]], axis=1)
    logger.info('Final training data shape: %s' % str(train_features.shape))

    x_train, x_valid, y_train, y_valid = train_test_split(
        train_features,
        train_raw[config.TARGET_FEATURE],
        test_size=0.20,
        random_state=42)
    del train_raw
    del train_features
    gc.collect()
    lgtrain = lgb.Dataset(x_train,
                          label=y_train,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)
    lgvalid = lgb.Dataset(x_valid,
                          label=y_valid,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)

    t0 = time()
    lightgbm_model = lgb.train(
        config.LGBM_PARAMS,
        lgtrain,
        config.LGBM_NUM_ROUNDS,
        valid_sets=lgvalid,
        verbose_eval=50,
        early_stopping_rounds=config.LGBM_EARLY_STOPPING_ROUNDS)
    logger.info('Training LightGBM model took: %s minutes' % round(
        (time() - t0) / 60, 1))

    # Save model
    t0 = time()
    MODEL_FILE_NAME = "lightgbm_model"
    model_file = os.path.join(config.DATA_MODELS_DIR,
                              MODEL_FILE_NAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % model_file)
    lightgbm_model.save_model(model_file,
                              num_iteration=lightgbm_model.best_iteration)
    logger.info('Saving %s lightgbm model took: %s minutes' %
                (MODEL_FILE_NAME, round((time() - t0) / 60, 1)))

    generate_figure_importance(lightgbm_model, logger)
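A hedged sketch of reloading the booster saved above for inference; model_file mirrors the path built in main(), and x_new stands in for a feature frame with the same columns as the training set.
# Sketch only: restore the saved LightGBM model and predict on new data.
import lightgbm as lgb

def predict_with_saved_model(model_file, x_new):
    booster = lgb.Booster(model_file=model_file)
    return booster.predict(x_new)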
def main_train(params, clf: Type[Classifier]):
    model = clf(L, LABELS)
    name = "{}--{}".format(model.name, int(datetime.now().timestamp()))
    print(params)
    chekpoints_path = os.path.join(params['output_path'], name + '_weights')
    os.makedirs(chekpoints_path, exist_ok=True)
    batch_size = params['batch_size']
    n = params['sample_size']

    train_data, validate_data = load_train_data(params['audio_path'],
                                                params['validation_list_path'])
    assert len(train_data) != 0
    assert len(validate_data) != 0

    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_data, wav_reader)

    train_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.AddNoiseWavProcessor(silence_data, L, L, 20),
        sp.ShiftWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )

    valid_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )

    if params['sample']:
        print("Get small sample")
        train_data, validate_data = get_sample_data(train_data, validate_data,
                                                    n)

    train_gen = train_generator(train_data, batch_size, train_sound_chain, n=n)
    validate_gen = valid_generator(validate_data, batch_size,
                                   valid_sound_chain, True)

    model.train(
        train_gen, validate_gen,
        dict(epochs=params['epochs'],
             batch_size=batch_size,
             tensorboard_dir=os.path.join(params['tensorboard_root'], name),
             chekpoints_path=chekpoints_path,
             steps_per_epoch=n * len(LABELS) / batch_size,
             validation_steps=int(np.ceil(validate_data.shape[0] /
                                          batch_size))))
def load_data(train_data_path='./aclImdb/train/', test_data_path='./aclImdb/test/'):
    # Load data
    print("Load Data...")
    Xtr_text, Ytr, Xva_text, Yva = load_train_data(train_data_path, 0.1)
    Xte_text, Yte = load_test_data(test_data_path)

    # Combine training and validation data:
    Xtr_text = np.append(Xtr_text, Xva_text)
    Ytr = np.append(Ytr, Yva)
    print("Done loading data!\n")

    return Xtr_text, Ytr, Xte_text, Yte
def train_model(training_iterations, batch_size, train_data_file):
    accuracy_list, cross_entropy_list = [], []
    xs_data, ys_data = load_train_data(train_data_file, True)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(training_iterations):
            x_batch, y_batch = get_batch(xs_data, ys_data, batch_size)
            training_data = {x: x_batch, y_: y_batch}
            accrcy, s_cross = sess.run([accuracy, cross_entropy],
                                       feed_dict=training_data)

            #Backpropagation
            sess.run(train_step, feed_dict=training_data)
            accuracy_list.append(accrcy)
            cross_entropy_list.append(s_cross)
        return accuracy_list, cross_entropy_list
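Neither excerpt shows get_batch; a hypothetical version, assuming x_data and y_data are NumPy arrays, could sample a random mini-batch like this.
# Hypothetical helper (not from the original): draw a random mini-batch.
import numpy as np

def get_batch(x_data, y_data, batch_size):
    idx = np.random.choice(len(x_data), size=batch_size, replace=False)
    return x_data[idx], y_data[idx]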
Example #9
def test_train_generator(self):
    train_df, valid_df = load_train_data(self.audio_path,
                                         self.validation_list_path)
    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_df, wav_reader)
    train_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )
    n = 2
    gen = train_generator(train_df, 64, train_sound_chain, n)
    batch = next(gen)
    self.assertEqual(batch[0].shape, (len(LABELS) * n, L, 1))
    self.assertEqual(batch[1].shape, (len(LABELS) * n, len(LABELS)))
def load_train_data_with_dictionary(file_path, freq_threshold=0):
    dictionary = {"PADDING_TOKEN": 0, "UNKNOWN_TOKEN": 1}
    word_freq = collections.defaultdict(int)

    train_df, valid_df = data_loader.load_train_data(file_path)
    train_doc, valid_doc = [], []
    for idx, row in train_df.iterrows():
        doc = process_row(row)

        train_doc.append(doc)
        for token in doc.words:
            word_freq[token] += 1

    for word, freq in word_freq.items():
        if freq >= freq_threshold:
            dictionary[word] = len(dictionary)

    for idx, row in valid_df.iterrows():
        doc = process_row(row)
        valid_doc.append(doc)

    return train_doc, valid_doc, dictionary
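A hedged companion sketch showing how the returned dictionary might be applied to a processed document, with unseen tokens falling back to UNKNOWN_TOKEN; doc.words comes from the function above, everything else is illustrative.
# Sketch only: convert a document's tokens to integer ids.
def doc_to_ids(doc, dictionary):
    unk = dictionary["UNKNOWN_TOKEN"]
    return [dictionary.get(token, unk) for token in doc.words]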
Example #11
def train():
    init_seed(111)
    split_num = 720
    train_num = 1600
    val_num = 100
    train_indices = torch.randperm(1700)[:train_num]
    val_indices = torch.randperm(1700)[:val_num]
    train_numerical = np.array([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14])
    val_numerical = np.array([6])
    # train_numerical = np.array([])
    # val_numerical = np.array([])

    train_datasets = [
        Subset(load_train_data('cmip', which_num=num), train_indices)
        for num in train_numerical
    ]
    train_datasets.append(load_train_data('soda', split_num=split_num))
    print('Training Samples: {}'.format(
        len(train_numerical) * train_num + split_num))
    valid_datasets = [
        Subset(load_val_data('cmip', which_num=num), val_indices)
        for num in val_numerical
    ]
    valid_datasets.append(load_val_data('soda', split_num=split_num + 60))
    print('Validation Samples: {}'.format(
        len(val_numerical) * val_num + 1200 - split_num))

    train_loaders = [
        DataLoader(train_dataset, batch_size=args['batch_size'])
        for train_dataset in train_datasets
    ]
    valid_loaders = [
        DataLoader(valid_dataset, batch_size=args['batch_size'])
        for valid_dataset in valid_datasets
    ]

    device = args['device']
    model = args['model_list'][args['model_name']]()
    print_model_parameters(model)

    if args['pretrain'] and os.path.exists(save_dir):
        model.load_state_dict(torch.load(save_dir, map_location=device))
        print('load model from:', save_dir)

    optimizer = torch.optim.AdamW(model.parameters(), lr=args['learning_rate'])
    if args['lr_decay']:
        print('Applying learning rate decay.')
        lr_decay_steps = [int(i) for i in args['lr_decay_step']]
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer=optimizer,
            milestones=lr_decay_steps,
            gamma=args['lr_decay_rate'])
    else:
        lr_scheduler = None
    loss_fn = nn.MSELoss().to(device)
    # loss_fn = score_loss

    model.to(device)

    best_score = float('-inf')
    not_improved_count = 0

    for i in range(args['n_epochs']):
        model.train()
        loss_epoch = 0
        for train_loader in train_loaders:
            for step, (sst, t300, ua, va, label, month,
                       sst_label) in enumerate(train_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                # month = month[:, :1].to(device).long()
                month = month.to(device).long()
                label = label.to(device).float()
                optimizer.zero_grad()

                preds = model(sst, t300, ua, va, month)
                loss = loss_fn(preds, label)
                loss.backward()
                loss_epoch += loss.item()
                if args['grad_norm']:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args['max_grad_norm'])

                optimizer.step()
                del preds, loss
        if args['lr_decay']:
            lr_scheduler.step()
        model.eval()
        y_true, y_pred = [], []
        for valid_loader in valid_loaders:
            for step, (sst, t300, ua, va, label, month,
                       sst_label) in enumerate(valid_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                month = month.to(device).long()
                label = label.to(device).float()
                preds = model(sst, t300, ua, va, month)
                y_pred.append(preds.detach())
                y_true.append(label.detach())
                del preds

        y_true = torch.cat(y_true, axis=0)
        y_pred = torch.cat(y_pred, axis=0)
        sco = eval_score(y_true.cpu().detach().numpy(),
                         y_pred.cpu().detach().numpy())
        print('Epoch: {}, Train Loss: {}, Valid Score: {}'.format(
            i + 1, loss_epoch, sco))
        if sco > best_score:
            best_score = sco
            not_improved_count = 0
            best_state = True
        else:
            not_improved_count += 1
            best_state = False

        if not_improved_count == args['early_stop_patience']:
            print("Validation performance didn\'t improve for {} epochs. "
                  "Training stops.".format(args['early_stop_patience']))
            break
        if best_state:
            best_model = deepcopy(model.state_dict())
            torch.save(best_model, save_dir)
            # torch.save(model, '../user_data/ref.pkl')
            print('Model saved successfully:', save_dir)
Example #12
import augmentation_methods as am
import data_loader as dl
import word_vectors as wv  
import data_preprocessing as dp 
import classifier as cl   
import testing as t
import visualization as vis 

if __name__ == "__main__":
    # get original data in tokenized form
    orig_corpus, y_train_orig = dl.load_train_data()
    test_corpus, y_test_orig = dl.load_test_data()

    # develop word vectors
    word_vectors = wv.get_word_vectors(orig_corpus)

    # augment corpora
    corpus_method_1, y_train_method_1 = am.method_1(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_2, y_train_method_2 = am.method_2(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_3, y_train_method_3 = am.method_3(orig_corpus.copy(), y_train_orig.copy(), word_vectors)

    # process the data into a form (tf-idf) that can be fed to classifiers
    X_orig, vectorizer = dp.process_corpus_orig(orig_corpus)
    X_method_1 = dp.process_corpus(corpus_method_1, vectorizer)
    X_method_2 = dp.process_corpus(corpus_method_2, vectorizer)
    X_method_3 = dp.process_corpus(corpus_method_3, vectorizer)
    X_test = dp.process_corpus(test_corpus, vectorizer)

    # train classifiers on the original corpus and all augmented corpora
    classifier_orig = cl.train_classifier_bayes(X_orig, y_train_orig)
    classifier_method_1 = cl.train_classifier_bayes(X_method_1, y_train_method_1)
Example #13
import h5py
import numpy as np
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem.wordnet import WordNetLemmatizer
import data_loader
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# %matplotlib inline
plt.set_cmap('RdYlBu')
import pre_processing

train, valid = data_loader.load_train_data('data/train.csv')
test = data_loader.load_test_data('data/test.csv', 'data/test_labels.csv')

list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

train_y = train[list_classes].values
valid_y = valid[list_classes].values
test_y = test[list_classes].values

train = train.fillna('')
valid = valid.fillna('')
test = test.fillna('')
"""## Data Exploration"""
def main():
    image_size = 50
    number_of_classes = 12

    # cached_files = os.listdir('cache/')

    # if no cached features and labels exist locally create them and then cache them
    # if 'train_features.csv' not in cached_files or 'train_labels.csv' not in cached_files:

    features, labels, categories = load_train_data(train_data_path='./data/train/',
                                                   image_size=image_size)

    # TODO create a fast caching system
    # np.savetxt('cache/train_features.csv', train_features, delimiter=',', fmt='%.4f')
    # np.savetxt('cache/train_labels.csv', train_labels, delimiter=',', fmt='%i')

    # # if cached features and labels are detected load them into variables
    # else:
    #     train_features = np.genfromtxt('cache/train_features.csv', delimiter=',')
    #     print('training features loaded from cache')
    #     train_labels = np.genfromtxt('cache/train_labels.csv', delimiter=',')
    #     print('training labels loaded from cache')

    binary_training_labels = keras.utils.to_categorical(labels, num_classes=number_of_classes)

    train_features, train_labels, crosval_features, crosval_labels, test_features, test_labels = \
        split_data(features, binary_training_labels, train_fraction=0.9, crosval_fraction=0.0, test_fraction=0.1)

    reg_value = 0.02

    # building nn topology
    model = Sequential()
    model.add(Dense(units=2500,
                    activation='relu',
                    input_dim=image_size ** 2,
                    kernel_regularizer=regularizers.l2(reg_value)))

    model.add(Dense(units=300,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(reg_value)))

    model.add(Dense(units=300,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(reg_value)))

    model.add(Dense(units=number_of_classes,
                    activation='sigmoid',
                    kernel_regularizer=regularizers.l2(reg_value)))

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

    # training_epochs = 200
    # model.fit(train_features, train_labels, epochs=training_epochs, batch_size=100)

    epoch = 0

    # hold historical training and test accuracy
    train_accuracy = {}
    test_accuracy = {}

    try:
        while epoch < 2000:
            model.fit(train_features, train_labels, epochs=1, batch_size=128)
            test_accuracy[epoch] = model.evaluate(test_features, test_labels, batch_size=128)[1]
            train_accuracy[epoch] = model.evaluate(train_features, train_labels, batch_size=128)[1]

            # TODO add sequential model saving

            print('\nepoch = %i\n' % epoch)

            epoch += 1

    except KeyboardInterrupt:
        pass

    # plotting training and test accuracy histories
    plt.plot(list(train_accuracy.keys()), list(train_accuracy.values()), label='train')
    plt.plot(list(test_accuracy.keys()), list(test_accuracy.values()), label='test')
    axes = plt.gca()
    # axes.set_ylim([0.8, 0.90])
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()

    test_accuracy = model.evaluate(test_features, test_labels, batch_size=1000)[1]
    print('trained model accuracy on test set = %f' % test_accuracy)
    acc = 0
    acc_freq = []
    freq_list = []
    for freq, c in sorted(freq_counter.items()):
        if freq > 200:
            break
        freq_list.append(freq)
        acc += c
        acc_freq.append(acc / float(total_count))

    plt.plot(freq_list, acc_freq, label=name)
    plt.title("Token frequency distribution of train data")
    plt.ylabel("cutoff proportion")
    plt.xlabel("cutoff token frequency")
    plt.legend()
    plt.savefig("token_frequency_cutoff_f200_{}.png".format(name))
    plt.close()


if __name__ == '__main__':
    train_df, valid_df = data_loader.load_train_data("./resources/train.csv")
    label_distribution(train_df, name="train")
    label_distribution(valid_df, name="valid")

    sentence_length_distribution()

    token_frequency_distribution()

    label_correlation()
Example #16
# In[15]:

import data_loader
import numpy as np
import pandas

# In[16]:

import warnings
warnings.filterwarnings('ignore')

# In[17]:

# load data and assign names
trdf, valdf = data_loader.load_train_data("data/adult.data", is_df=True)
## adding column labels https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/
trdf.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target"
]
valdf.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target"
]

# # Pipelines
    else:
        print("ERROR: Invalid algorithm name", args.algo)

    print("Algorithm:", args.algo, "| Scenario:", args.scenario)

    param_object = ParameterGrid(tuned_parameters)

    NUMBER_OF_FOLDS = 5
    skf = StratifiedKFold(
        n_splits=NUMBER_OF_FOLDS)  # Splits the data into stratified folds
    NGRAMS = [(1, 1), (2, 2), (3, 3),
              (1, 3)]  # Unigrams, Bigrams, Trigrams, UniBiTri_combined
    MAX_FEATURES = [None, 1000, 100000]

    X, Y = data_loader.load_train_data(scenario=args.scenario,
                                       N_WORDS=PASSAGE_LENGTH,
                                       exp=experiment)
    print("X: {} | Y: {} | Distribution: {}".format(len(X), len(Y),
                                                    Counter(Y)))
    print("Y preview:", Y[:3])

    results_path = '/path/Augmentation-for-Literary-Data/results/' + args.algo + '-params-' + str(
        PASSAGE_LENGTH
    ) + '/' + args.algo + '_' + str(NUMBER_OF_FOLDS) + 'foldCV_Case_' + str(
        args.scenario) + '_exp' + experiment + '.tsv'  # name of output file
    print("\n-------\nResults path:", results_path, "\n\n")
    results_file = open(results_path, "w")
    results_file.write(
        "Model\tF1-score\tAUROC\tWeighted F1\tPrecision\tRecall\tAccuracy\tAUPRC\tParameters\n"
    )
    run_experiments(X, Y)
Example #18
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in ['svm', 'nn']:
        print("Invalid command. Expected 'svm' or 'nn'.")
        return

    c_name = sys.argv[1]
    print('Running job: TF-IDF vectorization and ' + c_name.upper() +
          ' classifier.')

    train_data = data_loader.load_train_data().sample(
        frac=1, random_state=42).reset_index(drop=True)
    test_data = data_loader.load_test_data()

    if c_name == 'svm':
        classifier = LinearSVC(random_state=42)
        param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
            'classifier__C': [0.1, 1]
        }
    else:
        classifier = MLPClassifier((50, ),
                                   solver='lbfgs',
                                   learning_rate_init=1e-4,
                                   tol=1e-6,
                                   max_iter=200,
                                   random_state=42)
        param_grid = {'vectorizer__ngram_range': [(1, 1)]}

    pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', classifier)])

    cv_grid = GridSearchCV(pipe,
                           n_jobs=2,
                           cv=5,
                           verbose=3,
                           param_grid=param_grid)

    start_time = time.time()
    cv_grid.fit(train_data.text, train_data.sentiment)
    end_time = time.time()
    print('Total fit time: {}'.format(end_time - start_time))

    # Classification report
    pred = cv_grid.predict(train_data.text)
    cr = classification_report(train_data.sentiment, pred)
    print(cr)

    # Test predictions
    pred = cv_grid.predict(test_data.text)
    print('Predictions finished.')

    # Save predictions
    results = pd.DataFrame({'Id': test_data.index, 'Prediction': pred})
    results = results.set_index('Id')
    data_loader.save_submission(results,
                                'tfidf_' + c_name.upper() + '_submission.csv')
    print('Predictions saved.')

    # Save classification results
    cvr_path = path.join(
        'pickles', 'tfidf_' + c_name.upper() +
        '_cross_validation_results')  # Cross validation results
    be_path = path.join('pickles', 'tfidf_' + c_name.upper() +
                        '_best_estimator')  # Best estimator

    dump(cv_grid.cv_results_, open(cvr_path, 'wb'))
    dump(cv_grid.best_estimator_, open(be_path, 'wb'))
    print('Classification results saved.')
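A hedged note on reuse: the pickled results saved above can be loaded back later, assuming dump/load here come from the standard pickle module, as the call signature suggests.
# Sketch only: reload the persisted best estimator.
import pickle

def load_best_estimator(be_path):
    with open(be_path, 'rb') as f:
        return pickle.load(f)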
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
# load data (provided method)
train_data, valid_data = data_loader.load_train_data('Data/adult.data',
                                                     valid_rate=0.1,
                                                     is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

#update fields
native_country_dict = {
    ' ?': '?',
    ' Cambodia': 'Africa',
    ' Canada': 'North America',
    ' China': 'Asia',
    ' Columbia': 'Latin America',
    ' Cuba': 'Latin America',
    ' Dominican-Republic': 'Latin America',
    ' Ecuador': 'Latin America',
    ' El-Salvador': 'Latin America',
    ' England': 'Europe',
    parser.add_argument(
        '--cda', help='Use CDA for Data Augmentation',
        action="store_true")  # Use CDA; default: no Data Augmentation

    args = parser.parse_args()

    # Load training data:
    if args.eda:  # with EDA
        X_train, Y_train = data_loader.load_train_data_with_EDA(
            scenario=args.scenario)

    elif args.cda:  # with CDA
        pass

    else:  # without any Data Augmentation
        X_train, Y_train = data_loader.load_train_data(args.scenario)

    X_train = X_train.tolist()
    Y_train = Y_train.tolist()  # convert to list
    labels_train = labels_str_to_int(Y_train)  # convert labels to integers

    # Test data:
    X_test, Y_test, test_IDs = data_loader.load_test_data()
    X_test = X_test.tolist()
    Y_test = Y_test.tolist()
    test_IDs = test_IDs.tolist()  # convert to list
    labels_test = labels_str_to_int(Y_test)  # convert labels to integers
    testIDs_idx = np.linspace(
        0, len(test_IDs), len(test_IDs), False
    )  # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
    print("testIDs indexes:", len(testIDs_idx))
Example #21
        # Load training data:
        if args.eda and not args.cda:  # only EDA
            train_passages, Y_train = data_loader.load_train_data_with_EDA(
                scenario=args.scenario)

        elif args.cda and not args.eda:  # only CDA
            train_passages, Y_train = data_loader.load_train_data_with_CDA(
                scenario=args.scenario)

        elif args.eda and args.cda:  # both EDA and CDA
            train_passages, Y_train = data_loader.load_train_data_with_EDA_and_CDA(
                scenario=args.scenario)

        else:  # without any Data Augmentation
            train_passages, Y_train = data_loader.load_train_data(
                scenario=args.scenario,
                N_WORDS=DOCUMENT_LENGTH,
                exp=experiment)

        print("\nTrain Set ---- X: {} | Y: {} | Distribution: {}".format(
            len(train_passages), len(Y_train), Counter(Y_train)))
        print("Y train preview:", Y_train[:3])

        # Load test data (same for each scenario, with or without augmentation):
        test_passages, Y_test, test_IDs = data_loader.load_test_data(
            N_WORDS=DOCUMENT_LENGTH)
        print(
            "Test Set ---- X: {} | Y: {} | Distribution: {} | Test IDs: {}, preview: {}"
            .format(len(test_passages), len(Y_test), Counter(Y_test),
                    len(test_IDs), test_IDs[:3]))
        print("Y test preview:", Y_test[:3])
Example #22
import model
import data_loader
import torch

train_loader, neg_loader, neg_len = data_loader.load_train_data()
g = model.Generator()
d = model.Discriminator()
g_optim = torch.optim.Adam(g.parameters(), lr=0.001, weight_decay=0)
d_optim = torch.optim.Adam(d.parameters(), lr=0.001, weight_decay=0)
model.train(g, d, train_loader, neg_loader, 100, g_optim, d_optim, neg_len)
Example #23
def load_train_data_from_file(train_file_path):
    df, _ = load_train_data(train_file_path, 0)
    df.replace(' ?', np.nan, inplace=True)
    df = df.dropna()
    return df
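An assumed usage example for the helper above; the file path is a placeholder.
# Sketch only: load the cleaned training frame and inspect it.
clean_df = load_train_data_from_file('data/adult.data')
print(clean_df.shape)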
Example #24
def train():
    init_seed(1995)
    sample_num = 150
    indices = torch.randperm(1700)[:sample_num]
    train_numerical = np.array([1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 15]) - 1
    val_numerical = np.array([3, 6, 9, 12]) - 1
    train_datasets = [
        Subset(load_train_data('cmip', which_num=num), indices)
        for num in train_numerical
    ]
    print('Training Samples: {}'.format(len(train_numerical) * sample_num))
    valid_datasets = [
        Subset(load_val_data('cmip', which_num=num), indices)
        for num in val_numerical
    ]
    print('Validation Samples: {}'.format(len(val_numerical) * sample_num))
    train_loaders = [
        DataLoader(train_dataset, batch_size=args['batch_size'])
        for train_dataset in train_datasets
    ]
    valid_loaders = [
        DataLoader(valid_dataset, batch_size=args['batch_size'])
        for valid_dataset in valid_datasets
    ]

    device = args['device']
    model = args['model_list'][args['model_name']]()
    if args['pretrain'] and os.path.exists(save_dir):
        model.load_state_dict(torch.load(save_dir, map_location=device))
        print('load model from:', save_dir)

    optimizer = torch.optim.Adam(model.parameters(), lr=args['learning_rate'])
    loss_fn = nn.MSELoss()

    model.to(device)
    loss_fn.to(device)
    print_model_parameters(model)

    best_score = float('-inf')
    not_improved_count = 0

    for i in range(args['n_epochs']):
        model.train()
        loss_epoch = 0
        for cmip_num, train_loader in enumerate(train_loaders):
            loss_numodel = 0
            for step, (sst, t300, ua, va, label,
                       sst_label) in enumerate(train_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                # sst_label = sst_label.to(device).float()
                optimizer.zero_grad()
                label = label.to(device).float()
                # output, preds = model(sst, t300, ua, va)
                preds = model(sst, t300, ua, va)
                loss1 = loss_fn(preds, label)
                # loss2 = loss_fn(output, sst_label)
                # loss = loss1 + loss2
                loss1.backward()
                loss_numodel += loss1.item()
                if args['grad_norm']:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args['max_grad_norm'])

                optimizer.step()
                del preds, loss1
            loss_epoch += loss_numodel
            print('numerical model {} loss: {}'.format(cmip_num, loss_numodel))

        model.eval()
        y_true, y_pred = [], []
        for valid_loader in valid_loaders:
            for step, (sst, t300, ua, va, label,
                       sst_label) in enumerate(valid_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                label = label.to(device).float()
                preds = model(sst, t300, ua, va)
                y_pred.append(preds.detach())
                y_true.append(label.detach())
                del preds

        y_true = torch.cat(y_true, axis=0)
        y_pred = torch.cat(y_pred, axis=0)
        sco = eval_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
        print('Epoch: {}, Train Loss: {}, Valid Score: {}'.format(
            i + 1, loss_epoch, sco))
        if sco > best_score:
            best_score = sco
            not_improved_count = 0
            best_state = True
        else:
            not_improved_count += 1
            best_state = False

        if not_improved_count == args['early_stop_patience']:
            print("Validation performance didn\'t improve for {} epochs. "
                  "Training stops.".format(args['early_stop_patience']))
            break
        if best_state:
            best_model = deepcopy(model.state_dict())
            torch.save(best_model, save_dir)
            # torch.save(model, '../user_data/ref.pkl')
            print('Model saved successfully:', save_dir)
Example #25
"""Trains the model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np

from data_loader import load_train_data, univariate_data
from rnn_predictor import RNNPredictor

raw_data = load_train_data()
data, labels = univariate_data(raw_data["ConfirmedCases"], 0, None, 5, 0)

val_data, val_labels = [list(data[20]), list(data[40]), list(data[63])], [labels[20], labels[40], labels[63]]
train_data, train_labels = np.delete(data, [20, 40, 63], axis=0), np.delete(labels, [20, 40, 63], axis=0)

val_data_set = tf.data.Dataset.from_tensor_slices((val_data, val_labels))
data_set = tf.data.Dataset.from_tensor_slices((train_data, train_labels))

data_set = data_set.shuffle(len(list(data_set.as_numpy_iterator())), reshuffle_each_iteration=True)
data_set = data_set.repeat(5)

predictor = RNNPredictor(256)

epochs = range(10000)

loss_object = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(lr=1e-2, clipvalue=1)
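The excerpt stops before the training loop; a hedged sketch of one possible TF2 custom loop over the prepared dataset follows. The batch size and the assumption that an RNNPredictor is callable like a Keras model (exposing trainable_variables) are mine, not the original's.
# Sketch only: gradient-tape training loop over the shuffled, repeated dataset.
batched = data_set.batch(32)

for epoch in epochs:
    for x, y in batched:
        with tf.GradientTape() as tape:
            preds = predictor(x)
            loss = loss_object(y, preds)
        grads = tape.gradient(loss, predictor.trainable_variables)
        optimizer.apply_gradients(zip(grads, predictor.trainable_variables))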
Example #26
from inference import inference
from postprocess import get_rle_encoding

CW_DIR = os.getcwd()
TRAIN_DIRS = [os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'data', 'stage1_train'),
              os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'extra_data'),
              os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage1_test',
                           'DSB2018_stage1_test-master', 'stage1_test')]
TEST_DIR = os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage2_test')
IMG_DIR_NAME = 'images'
MASK_DIR_NAME = 'masks'

train_df = read_train_data_properties(TRAIN_DIRS, IMG_DIR_NAME, MASK_DIR_NAME)
test_df = read_test_data_properties(TEST_DIR, IMG_DIR_NAME)

x_train, y_train, contour_train, no_contour_train = load_train_data(train_df)
y_train_full = np.array([
    np.concatenate((x, y, z), axis=2)
    for x, y, z in zip(y_train, contour_train, no_contour_train)
])
labels_train = get_train_labels(train_df)

x_test = load_test_data(test_df)

model_paths = train(train_df, y_train_full, labels_train)
y_prediction = inference(x_test, model_paths)
y_test_rle, y_test_ids = get_rle_encoding(test_df, y_prediction)

sub = pd.DataFrame()
sub['ImageId'] = y_test_ids
sub['EncodedPixels'] = pd.Series(y_test_rle).apply(