Ejemplo n.º 1
0
def train_with_generator(path, file, file_test, output, epochs, batch_size, checkpoint, shuffle=False, fast=False, sample_rate=16000):
    """Train a Keras classifier from mel-spectrogram batch generators.

    Args:
        path: Not used by this function.
        file: Passed to ``prepare_data`` to obtain training samples and labels.
        file_test: Passed to ``prepare_data`` to obtain validation samples and labels.
        output: Base name for the TensorBoard log directory (``./logs/<output>``)
            and for the checkpoint file (``<output>.hdf5``).
        epochs: Number of epochs handed to ``fit_generator``.
        batch_size: Batch size given to both generators and to TensorBoard.
        checkpoint: Forwarded to ``ModelCheckpoint`` as ``period``.
        shuffle: Not used by this function.
        fast: Forwarded to both batch generators.
        sample_rate: Forwarded to both generators as ``new_sample_rate``.
    """
    # Data: (samples, labels) pairs for the training and validation files.
    x_train, y_train = prepare_data(file)
    x_test, y_test = prepare_data(file_test)
    num_classes = len(set(y_train))
    print('nombre de classes : ', num_classes)

    # Keyword arguments common to the training and validation generators.
    shared_kwargs = dict(
        n_mels=40,
        transpose=True,
        data_aug=True,
        coeff_amplitude=True,
        coeff_time=4000,
        fast=fast,
        new_sample_rate=sample_rate,
    )
    train_generator = batch_generator_shuffle(
        batch_size, x_train, y_train, load_data_with_mel_spectrogram,
        proba_data_aug=0.7, **shared_kwargs)
    test_generator = batch_generator(
        batch_size, x_test, y_test, load_data_with_mel_spectrogram,
        proba=0.7, **shared_kwargs)

    # One epoch covers every sample once, hence the ceiling division.
    step_train = math.ceil(len(x_train) / batch_size)
    print('step train :', step_train)
    step_test = math.ceil(len(x_test) / batch_size)

    print('shape:', len(x_train))
    print('shape:', len(x_test))
    print('step train :', step_train)
    print('step test :', step_test)

    # Network and optimizer.
    model = get_model2((32, 40), num_classes, 3, 'relu')
    model.summary()
    sgd = keras.optimizers.SGD(lr=0.01, decay=1e-5, momentum=0.9, nesterov=True)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=sgd,
        metrics=['categorical_accuracy'],
    )

    # Callbacks: TensorBoard logging, best-model checkpointing, LR reduction on plateau.
    callback_tensorboard = keras.callbacks.TensorBoard(
        log_dir='./logs/' + output,
        histogram_freq=0,
        batch_size=batch_size,
        write_graph=True,
        write_grads=False,
        write_images=False,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
    )
    reduce = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        verbose=1,
        mode='auto',
        epsilon=0.0001,
        cooldown=0,
        min_lr=0,
    )
    checkpoints = ModelCheckpoint(
        output + '.hdf5',
        verbose=1,
        save_best_only=True,
        period=checkpoint,
        save_weights_only=False,
    )
    callbacks_list = [callback_tensorboard, checkpoints, reduce]

    # Training.
    model.fit_generator(
        train_generator,
        steps_per_epoch=step_train,
        epochs=epochs,
        verbose=1,
        validation_data=test_generator,
        validation_steps=step_test,
        callbacks=callbacks_list,
    )
Ejemplo n.º 2
0
def main(training_file, test_file, submission_file, ratio):
    """Train a classifier on ``training_file`` and write predictions for ``test_file``.

    Args:
        training_file: Path handed to ``utilities.read_file`` for the training data.
        test_file: Path handed to ``utilities.read_file`` for the test data.
        submission_file: Destination passed to ``utilities.write_submission_file``.
        ratio: Forwarded to ``preprocess.down_sample`` for the training split.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # Select feature indices on the training data and apply the same
    # selection to the test data so both share the same columns.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    # Hold out 30% for validation; only the training part is down-sampled.
    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
Ejemplo n.º 3
0
def main(training_file, test_file, submission_file, ratio):
    """Train a classifier on ``training_file`` and write predictions for ``test_file``.

    Args:
        training_file: Path handed to ``utilities.read_file`` for the training data.
        test_file: Path handed to ``utilities.read_file`` for the test data.
        submission_file: Destination passed to ``utilities.write_submission_file``.
        ratio: Forwarded to ``preprocess.down_sample`` for the training split.
    """
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Preparing data...')
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # Select feature indices on the training data and apply the same
    # selection to the test data so both share the same columns.
    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print('Get %s features.' % len(x[0]))

    # Hold out 30% for validation; only the training part is down-sampled.
    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
Ejemplo n.º 4
0
def train_word2vec():
    """Run 2-fold cross-validation of a word2vec + LSTM essay scorer.

    For every fold a word2vec model is built from the training essays, each
    essay is turned into an averaged feature vector, an LSTM model is fitted
    and the quadratic-weighted Cohen's kappa on the held-out essays is
    printed. The mean kappa over the folds is printed at the end.

    Hyper-parameters (``dataset_path``, ``num_features``, ``batch_size``,
    ``epoch`` and the ``get_model`` arguments) are read from names defined
    outside this function.
    """
    cv = KFold(n_splits=2, shuffle=True)
    X, y, _ = prepare_data(dataset_path=dataset_path)
    results = []

    for fold_count, (traincv, testcv) in enumerate(cv.split(X), start=1):
        print("\n--------Fold {}--------\n".format(fold_count))
        # Split the dataset for this fold.
        X_train, X_test = X.iloc[traincv], X.iloc[testcv]
        y_train, y_test = y.iloc[traincv], y.iloc[testcv]
        train_essays = X_train['essay']
        test_essays = X_test['essay']

        # One word list per training essay.
        train_sentences = [
            essay_to_wordlist(essay, remove_stopwords=True)
            for essay in train_essays
        ]

        # word2vec embedding, built from the training essays only.
        print("Converting sentences to word2vec model")
        model, _ = build_word2vec(train_sentences, num_workers, num_features,
                                  min_word_count, context, downsampling)

        trainDataVecs = np.array(
            getAvgFeatureVecs(train_sentences, model, num_features))
        test_sentences = [
            essay_to_wordlist(essay, remove_stopwords=True)
            for essay in test_essays
        ]
        testDataVecs = np.array(
            getAvgFeatureVecs(test_sentences, model, num_features))

        # Insert a length-1 middle axis: (n_samples, 1, n_features).
        trainDataVectors = np.reshape(
            trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
        testDataVectors = np.reshape(
            testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

        lstm_model = get_model(Hidden_dim1=Hidden_dim1, Hidden_dim2=Hidden_dim2,
                               return_sequences=return_sequences,
                               dropout=dropout,
                               recurrent_dropout=recurrent_dropout,
                               input_size=input_size, activation=activation,
                               model_name=model_name, optimizer=optimizer,
                               loss_function=loss_function)
        lstm_model.fit(trainDataVectors, y_train, batch_size=batch_size,
                       epochs=epoch)

        y_pred = np.around(lstm_model.predict(testDataVectors))
        # np.nan_to_num returns a new array by default; the result must be
        # kept, otherwise NaN predictions reach the kappa computation.
        y_pred = np.nan_to_num(y_pred)

        result = cohen_kappa_score(y_test.values, y_pred, weights='quadratic')
        print("Kappa Score: {}".format(result))
        results.append(result)

    print("Average kappa score value is : {}".format(np.mean(np.asarray(results))))
Ejemplo n.º 5
0
def evaluate_accuracy(sess, acc, x, x_mask, y, eval_data, eval_labels, mb, maxlen):
    """Return the mean of ``acc`` over full mini-batches of ``eval_data``.

    Only ``len(eval_data) // mb`` complete batches of size ``mb`` are
    evaluated; any remaining samples at the end are not visited.

    Args:
        sess: Object whose ``run(acc, feed_dict=...)`` evaluates one batch.
        acc: Fetch passed to ``sess.run``.
        x, x_mask, y: Feed-dict keys for inputs, input mask and labels.
        eval_data: Evaluation samples, indexed through ``preprocess.prepare_data``.
        eval_labels: Evaluation labels, indexed through ``preprocess.prepare_labels``.
        mb: Mini-batch size.
        maxlen: Forwarded to ``preprocess.prepare_data``.

    Returns:
        ``np.mean`` of the per-batch accuracy values.
    """
    n_batches = len(eval_data) // mb
    batch_acc = np.zeros(n_batches)

    for b in trange(len(batch_acc)):
        # Contiguous index range covered by batch ``b``.
        first = b * mb
        batch_idx = np.arange(first, (b + 1) * mb)
        inputs, inputs_mask = preprocess.prepare_data(batch_idx, eval_data, maxlen)
        labels = preprocess.prepare_labels(batch_idx, eval_labels)

        batch_acc[b] = sess.run(
            acc, feed_dict={x: inputs, x_mask: inputs_mask, y: labels})

    return np.mean(batch_acc)
Ejemplo n.º 6
0
def main():    
    # Entry point: builds the configuration, data iterators, Seq2Seq model,
    # loss and optimizer, then enters the training loop.
    # Setup configuration class
    config = TrainConfig()

    # Load dataset iterator
    # prepare_data also returns the config object, which replaces the local one.
    train_iter, test_iter, config = preprocess.prepare_data(config)
    config.display()


    # Setup and build coarse2fine training inference
    # NOTE(review): these names are bound to the classes themselves, not to
    # instances, and `attn` is not used again in the visible code -- confirm
    # that Seq2Seq expects classes here.
    attn = Attention
    enc = Encoder
    dec = Decoder

    model = Seq2Seq(enc, dec,).to(config.DEVICE)

    # Initialize network -> Load the pretrained embeddings onto our model
    ## pretrained_embeddings = quote.vocab.vectors
    ## model.embedding.weight.data.copy_(pretrained_embeddings)


    # initialize the model to a special initialization, and calculate the trainable parameter
    model.apply(init_weights_base)
    # NOTE(review): 'red' is passed as a second argument to print(), not to
    # colored() -- the closing parenthesis looks misplaced; confirm intent.
    print(colored(f'The model has {count_parameters(model):,} trainable parameters'),'red')



    # Initialize the loss function and create an optimizer
    # Targets equal to config.TRG_PAD_IDX are ignored by the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=config.TRG_PAD_IDX)
    optimizer = optim.Adam(model.parameters())


    # Save vocabulary at last

    # Start training
    if config.K_FOLD:
        # K-fold training is not implemented in the visible code.
        pass
    else:
        # NOTE(review): N_EPOCHS is not defined in this function; presumably
        # a module-level constant -- confirm.
        for epoch in range(N_EPOCHS):
            # NOTE(review): the statement below has no right-hand side in the
            # visible source (the snippet is truncated here).
            best_model, best_epach = 
Ejemplo n.º 7
0
def build_visualization():
    """Build a word2vec model per CV fold and visualise it.

    For each of the two folds, the training essays are split into sentences,
    a word2vec model is built from them, the most similar words of the first
    ten entries of ``sorted_dic`` are printed, and ``tsne_plot`` is called on
    the model.

    ``dataset_path`` and the word2vec hyper-parameters are read from names
    defined outside this function.
    """
    cv = KFold(n_splits=2, shuffle=True)
    # Only the feature frame is needed here; labels are not used.
    X, _ = prepare_data(dataset_path=dataset_path)

    for fold_count, (traincv, _testcv) in enumerate(cv.split(X), start=1):
        print("\n--------Fold {}--------\n".format(fold_count))
        train_essays = X.iloc[traincv]['essay']

        # Flatten all sentences of all training essays into one list.
        train_sentences = []
        for essay in train_essays:
            train_sentences += essay_to_sentences(essay, remove_stopwords=True)

        # word2vec embedding
        print("Converting sentences to word2vec model")
        model, sorted_dic = build_word2vec(train_sentences, num_workers,
                                           num_features, min_word_count,
                                           context, downsampling)

        # Only the key of each (key, value) pair is needed for the lookup.
        for k, _ in sorted_dic[:10]:
            print("----------most_similar_word_for:" + str(k) +
                  "--------------")
            print(model.wv.most_similar(k))

        tsne_plot(model)
Ejemplo n.º 8
0
def _bert_cls_features(model, tokenized):
    """Return the first-position hidden state of ``model`` per token-id list.

    Args:
        model: Callable taking ``(input_ids, attention_mask=...)`` and
            returning a sequence whose first element is the hidden states.
        tokenized: Object with a ``.values`` attribute holding one list of
            token ids per sample.

    Returns:
        2-D numpy array obtained by taking position 0 along the second axis
        of the hidden states.
    """
    sequences = list(tokenized.values)
    max_len = max((len(ids) for ids in sequences), default=0)
    # Right-pad every sequence with 0 up to the longest one.
    padded = np.array([ids + [0] * (max_len - len(ids)) for ids in sequences])
    # Positions holding the padding id 0 are masked out.
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    mask = torch.tensor(attention_mask)
    with torch.no_grad():
        hidden_states = model(input_ids, attention_mask=mask)
    return hidden_states[0][:, 0, :].numpy()


def train_bert_sets():
    """Cross-validate a DistilBERT-features + LSTM scorer on each essay set.

    For every set returned by ``prepare_data`` a 5-fold CV is run: essays are
    tokenised (max 200 tokens), turned into DistilBERT first-token features,
    an LSTM model is fitted on them and the quadratic-weighted Cohen's kappa
    on the held-out fold is printed. The mean kappa per set is printed too.

    ``dataset_path``, ``batch_size``, ``epoch`` and the ``get_model``
    arguments are read from names defined outside this function.
    """
    warnings.filterwarnings('ignore')
    ## Sets experiment BERT
    _, _, sets = prepare_data(dataset_path=dataset_path)
    warnings.filterwarnings('ignore')
    all_sets_score = []

    for set_count, s in enumerate(sets, start=1):
        print("\n--------SET {}--------\n".format(set_count))
        X = s
        y = s['domain1_score']
        cv = KFold(n_splits=5, shuffle=True)
        results = []
        cuda = torch.device('cuda')

        # For DistilBERT:
        model_class, tokenizer_class, pretrained_weights = (
            ppb.DistilBertModel, ppb.DistilBertTokenizer,
            'distilbert-base-uncased')
        ## Want BERT instead of distilBERT? Use:
        ## (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
        # Load pretrained model/tokenizer
        tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        model = model_class.from_pretrained(pretrained_weights)

        def encode(text):
            return tokenizer.encode(text, add_special_tokens=True,
                                    max_length=200)

        with torch.cuda.device(cuda):
            for fold_count, (traincv, testcv) in enumerate(cv.split(X), start=1):
                torch.cuda.empty_cache()
                print("\n--------Fold {}--------\n".format(fold_count))
                # get the train and test from the dataset.
                X_train, X_test = X.iloc[traincv], X.iloc[testcv]
                y_train, y_test = y.iloc[traincv], y.iloc[testcv]

                tokenized_train = X_train['essay'].apply(encode)
                tokenized_test = X_test['essay'].apply(encode)

                train_features = _bert_cls_features(model, tokenized_train)
                test_features = _bert_cls_features(model, tokenized_test)

                # Insert a length-1 middle axis: (n_samples, 1, n_features).
                train_x, train_y = train_features.shape
                test_x, test_y = test_features.shape
                trainDataVectors = np.reshape(train_features,
                                              (train_x, 1, train_y))
                testDataVectors = np.reshape(test_features,
                                             (test_x, 1, test_y))

                lstm_model = get_model(Hidden_dim1=Hidden_dim1,
                                       Hidden_dim2=Hidden_dim2,
                                       return_sequences=return_sequences,
                                       dropout=dropout,
                                       recurrent_dropout=recurrent_dropout,
                                       input_size=input_size,
                                       activation=activation,
                                       loss_function=loss_function,
                                       optimizer=optimizer,
                                       model_name=model_name)
                history = lstm_model.fit(trainDataVectors,
                                         y_train,
                                         batch_size=batch_size,
                                         epochs=epoch)
                plot_accuracy_curve(history)

                y_pred = np.around(lstm_model.predict(testDataVectors))
                # np.nan_to_num returns a new array by default; keep the
                # result so NaN predictions do not reach the kappa score.
                y_pred = np.nan_to_num(y_pred)

                # evaluate the model
                result = cohen_kappa_score(y_test.values,
                                           y_pred,
                                           weights='quadratic')
                print("Kappa Score: {}".format(result))
                results.append(result)
                tf.keras.backend.clear_session()

        all_sets_score.append(results)
        print("Average kappa score value is : {}".format(
            np.mean(np.asarray(results))))
Ejemplo n.º 9
0
def _validation_loss_and_auc(model, X, y, num_words, batch_size, iterations):
    """Score ``X`` with ``model`` in mini-batches and return ``(loss, auc)``.

    The forward pass is run with ``TEST_FLAG=1`` on slices of ``batch_size``
    rows; the last slice takes whatever rows remain. The concatenated outputs
    are compared with ``y`` via unweighted binary cross-entropy with logits
    and via ``get_auc``.
    """
    batch_outputs = []
    num_valid_batches = int(np.ceil(len(y) / batch_size))
    for b in range(num_valid_batches):
        if b == num_valid_batches - 1:
            rows = slice(batch_size * b, None)
        else:
            rows = slice(batch_size * b, batch_size * (b + 1))
        batch_output = model.forward(X[rows, :], num_words[rows], iterations,
                                     TEST_FLAG=1)
        batch_outputs.append(batch_output.data.cpu().numpy())

    scores = list(itertools.chain.from_iterable(batch_outputs))
    total_output = Variable(
        torch.from_numpy(np.array(scores)).type(torch.cuda.FloatTensor))
    loss = F.binary_cross_entropy_with_logits(total_output.view(len(y)), y,
                                              weight=None)
    auc = get_auc(y.data.cpu().numpy(), scores)
    return loss, auc


def main():
    """Prepare (or load) the dataset, run K-fold training and save the model.

    Behaviour is driven by the module-level ``args`` object:

    * ``PREP_DATA``: rebuild and pickle the cleaned data, otherwise load it
      from ``preprocessed_multisent_data.txt``.
    * ``DO_CV``: run stratified K-fold training; per fold the training loss
      and AUC of every batch are recorded and the final validation loss and
      AUC are collected, then everything is pickled to
      ``cv_results_multisent``.
    * ``EVALUATE_TEST`` / ``EVALUATE_EVERY``: periodic validation during
      training.

    The model of the last fold is saved to ``CoNN_model_multisent.pt``.

    Raises:
        ValueError: if ``args.MODEL_NAME`` or ``args.optimizer`` is not
            supported (previously this surfaced as a ``NameError`` later on).
    """
    print("Document classification for Multi-domain sentimental dataset")
    basefile = 'sorted_data'  # folder containing 342k data
    if args.PREP_DATA == True:  # prepare data
        print('cleaning the train data')
        cleaned_data = prepare_data(basefile)
        print('saving the prepared data')
        with open("preprocessed_multisent_data.txt", "wb") as f:
            pickle.dump(cleaned_data, f)
    else:
        print('Loading the prepared data')
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # a cache file this program wrote itself.
        with open("preprocessed_multisent_data.txt", "rb") as f:
            cleaned_data = pickle.load(f)

    # Just for doing K fold CV: each record carries d[0] (used as the
    # stratification target) and d[1] (used as the sample).
    dataX = [d[1] for d in cleaned_data]
    dataY = [d[0] for d in cleaned_data]

    start = time.time()
    skf = StratifiedKFold(
        n_splits=args.K_FOLD, shuffle=True,
        random_state=None)  # Change random_state to 'int', act as a seed
    skf.get_n_splits(dataX, dataY)
    print(skf)
    valid_loss = []
    valid_auc = []
    # Run for these many batches (ref Alg 2)
    num_batches = args.NUM_BATCHES
    batch_size = args.BATCH_SIZE
    if batch_size % 4 != 0:
        print(
            "********POSSIBLE ERROR, as the data may not be properly divisble over all the GPUs "
        )
    learning_rate = args.lr
    loss_plot = {}
    auc_plot = {}
    if args.DO_CV == True:
        for k, (train_index, valid_index) in enumerate(skf.split(dataX, dataY)):
            loss_plot[k] = []
            auc_plot[k] = []
            # get the train and validation data of this fold
            (train_cleaned_data, train_labels, train_num_words, dictionary,
             valid_cleaned_data, valid_labels, valid_num_words) = get_CV_data(
                 cleaned_data, train_index, valid_index)
            # The counts printed below are computed from train_labels.
            print('value counts for valid set')
            bc = np.bincount(train_labels)
            ii = np.nonzero(bc)[0]
            print(np.vstack((ii, bc[ii])).T)

            print('Getting different weights for cost sensitive learning')
            weight_vector = cost_sensitive_weights(train_labels)

            train_cleaned_data = numpy_fillna(train_cleaned_data)
            # Documents are truncated to the mean training length.
            TRUNC_LENGTH = int(np.mean(train_num_words))
            print('Trucation length, done for vectorization of code',
                  TRUNC_LENGTH)
            train_cleaned_data, train_num_words = truncate_data(
                train_cleaned_data, train_num_words,
                TRUNC_LENGTH)  # max_words in a Document
            VOCAB_SIZE = int(len(dictionary)) + 1

            # Re-initialise the model for every fold.
            # Strings are compared with ==; `is` tests object identity and is
            # not reliable for text coming from the command line.
            if args.MODEL_NAME == 'CoNN':
                model = CoNN(args.MODEL_NAME,
                             args.hilbert_DIM,
                             args.WORD2VEC,
                             VOCAB_SIZE,
                             TRUNC_LENGTH,
                             dropout_p=args.DROPOUT,
                             USE_CUDA=args.USE_CUDA,
                             num_CLASS=args.num_CLASS)
            else:
                print("model not present")
                raise ValueError(
                    "unsupported MODEL_NAME: {!r}".format(args.MODEL_NAME))

            if args.USE_CUDA == True:
                model = model.cuda()
                print('WRAPPING around DataParallel.. ')
                model = nn.DataParallel(model)
            if args.MODEL_NAME == 'CoNN' and args.optimizer == 'adam':
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=learning_rate,
                                             betas=(0.9, 0.999),
                                             eps=1e-08,
                                             weight_decay=0)
            else:
                raise ValueError(
                    "unsupported optimizer: {!r}".format(args.optimizer))

            print('Adding Scheduler')
            scheduler = MultiStepLR(
                optimizer,
                milestones=[1000, 1200, 1500, 1800, 2400, 3000, 3500],
                gamma=0.5)

            print('K_fold = ', k)
            for t in range(1, num_batches + 1):
                scheduler.step()
                if t % 100 == 0:  # print learning rate every 100 batches
                    for param_group in optimizer.param_groups:
                        print('lr ', param_group['lr'])

                # ------------------------- TRAINING -------------------------
                # sample docs from train_cleaned_data
                Xs, ys, Xs_num_words, batch_weight_vector = vectorized_sample_data(
                    train_cleaned_data, train_labels, train_num_words,
                    batch_size, weight_vector)
                optimizer.zero_grad()  # zero the gradient buffers
                output = model.forward(Xs, Xs_num_words,
                                       args.ITERATIONS)  # DROPOUT of p
                # Per-sample weights implement the cost-sensitive learning.
                loss = F.binary_cross_entropy_with_logits(
                    output.view(len(ys)), ys, weight=batch_weight_vector)
                ys_actual = ys.data.cpu().numpy()
                scores_pred = output.data.cpu().numpy()
                auc = get_auc(ys_actual, scores_pred)
                print('time since start ', time_since(start, float(t)/float(num_batches)),'(%d %d%%)'\
                                  %(t, float(t)/float(num_batches)*100 ), 'loss', loss.data.cpu().numpy(), ' auc ', auc)
                loss_plot[k].append(loss.data.cpu().numpy())
                auc_plot[k].append(auc)
                loss.backward()
                optimizer.step()

                # ------------------ INTERMEDIATE EVALUATION -----------------
                if args.EVALUATE_TEST == True and t % args.EVALUATE_EVERY == 0:
                    if t == args.EVALUATE_EVERY:
                        # Vectorise the validation set once, on the first
                        # evaluation; Xt/yt are reused afterwards.
                        valid_cleaned_data = numpy_fillna(valid_cleaned_data)
                        valid_cleaned_data, valid_num_words = truncate_data(
                            valid_cleaned_data, valid_num_words, TRUNC_LENGTH)
                        Xt, yt, Xt_num_words, _ = vectorized_sample_data(
                            valid_cleaned_data, valid_labels, valid_num_words,
                            'dummy_len(valid_index)', TESTING_FLAG=True,
                            weight_vector='dummy')
                    # NOTE: maintaining the same batch size for validation too
                    loss, auc = _validation_loss_and_auc(
                        model, Xt, yt, Xt_num_words, batch_size,
                        args.ITERATIONS)
                    print('Valid loss for k =', k, ' Iteration num ', t, '\n loss ', \
                                                   loss.data.cpu().numpy()[0], '\n auc ', auc)

            loss_plot[k] = np.vstack(loss_plot[k])[:, 0]

            # ------------------------- VALIDATION ---------------------------
            print('checking loss on validation set')
            valid_cleaned_data = numpy_fillna(valid_cleaned_data)
            valid_cleaned_data, valid_num_words = truncate_data(
                valid_cleaned_data, valid_num_words,
                TRUNC_LENGTH)  # max_words in a Document
            Xs, ys, Xs_num_words, _ = vectorized_sample_data(
                valid_cleaned_data, valid_labels, valid_num_words,
                'dummy_len(valid_index)', TESTING_FLAG=True,
                weight_vector='dummy')
            loss, auc = _validation_loss_and_auc(
                model, Xs, ys, Xs_num_words, batch_size, args.ITERATIONS)
            print('Valid loss for k =', k, ' is ', loss, '\n', ' auc ', auc)
            valid_loss.append(loss.data.cpu().numpy())
            valid_auc.append(auc)

        valid_loss = np.vstack(valid_loss)[:, 0]
        print('K-fold valid loss ', valid_loss)
        print('K-fold auc ', valid_auc)
        print('avg auc ',
              np.array(valid_auc).mean(), ' std dev',
              np.array(valid_auc).std())
        with open('cv_results_multisent', 'wb') as fp:
            pickle.dump([valid_loss, valid_auc, loss_plot, auc_plot], fp)

    print('saving the model')
    torch.save(model, 'CoNN_model_multisent.pt')
    return
Ejemplo n.º 10
0
# ----------------- THE STEPS BELOW ARE SPECIFIC TO EACH DATA SET -----------------
# ----------------- WRITE YOUR OWN data2tsv() and prepare_custom() FUNCTIONS
# ----------------- THE STEPS BELOW ARE SPECIFIC TO EACH DATA SET -----------------
# Pipeline driver: each stage below runs only when its flag compares equal to
# True. The flags themselves are not defined in the visible part of the file.
if to_tsv == True:
    # convert raw data to input tsv format
    # raw_corpus2tsv.xml2tsv("../test/")
    raw_corpus2tsv.xml2tsv("/gpfs/data/datasci/paper-m/raw/hansard_xml/")

if custom_prep == True:
    # do data prep custom to hansard
    text = preprocess.prepare_custom(data_dt)
# ----------------- THE STEPS ABOVE ARE SPECIFIC TO EACH DATA SET -----------------

if prepare_data == True:
    # preprocess tsv data for topic modeling
    # NOTE(review): in the visible code `text` is only bound inside the
    # custom_prep branch above; unless it is bound earlier in the file this
    # raises NameError when custom_prep is not True -- confirm.
    preprocess.prepare_data(text)

if mallet_import == True:
    # load the mallet module
    # NOTE(review): os.system runs the command in a child shell; presumably
    # `module load` there does not change this process's environment -- confirm.
    os.system("module load mallet/2.0.8rc3")
    mallet.imprt()

if topic_model == True:
    # load the mallet module
    os.system("module load mallet/2.0.8rc3")
    # import preprocessed data to mallet objects and train LDA model
    # NOTE(review): the loop variable `topic` is unused and the whole
    # `n_topics` collection is passed on every iteration -- presumably
    # mallet.lda(topic) was intended; confirm.
    for topic in n_topics:
        mallet.lda(n_topics)

if rank_documents == True:
    # NOTE(review): this branch has no body in the visible source (truncated).
    # rank documents by chosen topic(s)
Ejemplo n.º 11
0
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import preprocess as pr
import mathfunc as mf
import random
import keras
import numpy as np
from keras import optimizers
from keras.models import Sequential
from keras.layers import Activation, Dense
import matplotlib.pyplot as plt
import learn
from keras.models import model_from_json
from keras.models import load_model

# Rebuild the network from its serialized architecture.  The context manager
# closes the JSON file as soon as it has been read; the bare open() call used
# previously never released the handle.
with open('model_0.json', 'r') as json_file:
    model = model_from_json(json_file.read())
# The weights are stored separately from the architecture.
model.load_weights('model_0.h5')

# NOTE(review): 12330 and 18 look like the row and column counts of the CSV --
# confirm against preprocess.prepare_data.
data = pr.prepare_data('online_shoppers_intention.csv', 12330, 18)
inputs = np.asarray(pr.extractInputs(data))
targets = np.asarray(pr.extractTargets(data))

# Predict for the first sample only; it is wrapped in a one-element batch
# before being passed to the model.
predictedOutput = model.predict(np.array([inputs[0]]))

print('Real output : ', targets[0])
print('Predicted output : ', predictedOutput[0])
Ejemplo n.º 12
0
    parser.add_argument(
        '--crop',
        default=1.0,
        type=float,
        help="If 1 no crop, if 0.25 crop 25%% from top and bottom")
    parser.add_argument('--tokenize', default=True,
                        type=str2bool)  # set false to read pretokenized data
    parser.add_argument('--save_models', default=True, type=str2bool)

    args = parser.parse_args()
    # args, unknown = parser.parse_known_args()  # use this version in jupyter notebooks to avoid conflicts

    init_random_seeds(args.seed)

    # if args.prepare:
    prepare_data(args)

    df_train, df_dev, df_test = read_files(args)

    # automatically identify the number of labels:
    num_labels = len(
        np.union1d(np.union1d(df_train['label'], df_dev['label']),
                   df_test['label']))
    args.num_labels = num_labels
    logging.info('Identified {} labels in the dataset.'.format(num_labels))

    if args.tokenize:
        train_data, dev_data, test_data = tokenize_data(
            args, df_train, df_dev, df_test)
    else:
        train_data, dev_data, test_data = read_tokenized_data(
Ejemplo n.º 13
0
def main():
    """Fit and score an SVM on each consecutive sub-sample of a prepared CSV.

    Command-line options choose the CSV file, the horizon passed to
    ``prepare_data``, the number of sub-samples the prepared frame is split
    into, the kernel with its hyper-parameters, and the train/test ratio.
    For every sub-sample a ``StandardScaler`` + ``SVC`` pipeline is fitted on
    the leading rows and scored with ``compute_acc``.  Per-sub-sample and
    averaged accuracy, precision, recall and F1 are printed to stdout.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        (
            "--path",
            {"type": str, "default": "dataset/^NSEI (3).csv", "help": "path of csv file"},
        ),
        (
            "--trading_days",
            {"type": int, "default": 30, "help": "Number of trading days"},
        ),
        (
            "--no_of_subsamples",
            {
                "type": int,
                "default": 1,
                "help": "Number of samples to take from csv file",
            },
        ),
        (
            "--kernel",
            {
                "type": str,
                "default": "rbf",
                "help": "the kernel for SVM",
                "choices": ["linear", "rbf", "poly", "custom", "cobb-douglas"],
            },
        ),
        (
            "--degree",
            {
                "type": int,
                "default": 3,
                "help": "value of p in polynomial/custom kernel",
            },
        ),
        (
            "--C",
            {
                "type": float,
                "default": 1.0,
                "help": "the regularisation parameter for SVM",
            },
        ),
        (
            "--gamma",
            {
                "type": float,
                "default": 1.0,
                "help": "the inner product coefficient in polynomial kernel",
            },
        ),
        (
            "--coef0",
            {"type": float, "default": 0.0, "help": "coefficient for polynomial kernel"},
        ),
        (
            "--train_test_ratio",
            {"type": float, "default": 0.75, "help": "fraction of train samples"},
        ),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    args = cli.parse_args()
    path, trading_days = args.path, args.trading_days
    no_of_subsamples = args.no_of_subsamples
    kernel, degree, C = args.kernel, args.degree, args.C
    coef0, train_test_ratio = args.coef0, args.train_test_ratio

    # "poly" always runs with gamma fixed at 1.0 and "rbf" with sklearn's
    # "scale" setting; every other kernel keeps the value from the CLI.
    if kernel == "poly":
        gamma = 1.0
    elif kernel == "rbf":
        gamma = "scale"
    else:
        gamma = args.gamma

    print("Details: ")
    print(f"Extracting data from: {path}")
    print(f"Trading Days: {trading_days}")
    print(f"Number of Subsamples: {no_of_subsamples}")

    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"
        print("Kernel: polynomial")
        print(f"Degree: {degree}")
    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print(f"Gamma: {gamma}")
        print(f"Degree: {degree}")
        # From here on Cobb-Douglas shares the "poly" code path; only its
        # gamma differs.
        kernel = "poly"
    elif kernel == "custom":
        print("Kernel: custom")
        print(f"Degree: {degree}")
    else:
        print(f"Kernel: {kernel}")

    print(f"Regularisation Parameter, C: {C}")

    # Callable kernels handed to SVC; they read gamma/degree from this scope.
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T)) ** degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T) ** degree)

    frame = load_csv(path)
    data = prepare_data(data_f=frame, horizon=trading_days, alpha=0.9)

    # "gain" is dropped before splitting; "pred" stays because it is the label.
    without_gain = [col for col in data.columns if col not in ("gain",)]
    chunks = np.array_split(data[without_gain], no_of_subsamples)
    input_columns = [col for col in data.columns if col not in ("gain", "pred")]

    # Running sums, index-aligned with the first four entries of
    # compute_acc's "training" and "test" results.
    train_totals = [0, 0, 0, 0]
    test_totals = [0, 0, 0, 0]

    started = time.time()

    stats = []
    for i in tqdm(range(no_of_subsamples)):
        chunk = chunks[i]
        X = chunk[input_columns]
        y = chunk["pred"]

        # Class balance of this sub-sample.
        print((y == 1).sum())
        print((y == 0).sum())

        # Positional split: leading rows train, trailing rows test.
        x_cut = int(train_test_ratio * len(X))
        y_cut = int(train_test_ratio * len(y))
        X_train, X_test = X[:x_cut], X[x_cut:]
        y_train, y_test = y[:y_cut], y[y_cut:]

        svc_options = {"C": C, "class_weight": "balanced", "cache_size": 100000}
        if kernel == "custom":
            svc_options["kernel"] = custom_kernel
        elif kernel == "poly":
            svc_options["kernel"] = poly_cobb_kernel
        else:
            svc_options.update(
                kernel=kernel, degree=degree, coef0=coef0, gamma=gamma
            )
        clf = make_pipeline(StandardScaler(), SVC(**svc_options))

        clf.fit(X_train, y_train)

        scores = compute_acc(clf, X_train, y_train, X_test, y_test)
        stats.append(scores)

        for k in range(4):
            train_totals[k] += scores["training"][k]
        for k in range(4):
            test_totals[k] += scores["test"][k]

    print(f"\nTime taken: {(time.time() - started) / 60} minutes")

    train_labels = (
        "Training Accuracy:\t",
        "Training Precision:\t",
        "Training Recall:\t",
        "Training F1:\t\t",
    )
    test_labels = (
        "Test Accuracy:\t\t",
        "Test Precision:\t\t",
        "Test Recall:\t\t",
        "Test F1:\t\t",
    )
    for i in range(no_of_subsamples):
        print(f"Stats for Subsample#{i + 1}")
        for k, label in enumerate(train_labels):
            print(label + str(stats[i]["training"][k]))

        print("\n")

        for k, label in enumerate(test_labels):
            print(label + str(stats[i]["test"][k]))

        print("\n")

    avg_train_labels = (
        "Average Training Accuracy:\t",
        "Average Training Precision:\t",
        "Average Training Recall:\t",
        "Average Training F1:\t\t",
    )
    avg_test_labels = (
        "Average Test Accuracy:\t\t",
        "Average Test Precision:\t\t",
        "Average Test Recall:\t\t",
        "Average Test F1:\t\t",
    )
    print("Average Results")
    for label, total in zip(avg_train_labels, train_totals):
        print(label + str(total / no_of_subsamples))

    print("\n")

    for label, total in zip(avg_test_labels, test_totals):
        print(label + str(total / no_of_subsamples))
Ejemplo n.º 14
0
# Best accuracies seen so far; both start at zero.
max_val_accuracy = 0.
max_test_accuracy = 0.

with tf.Session() as sess:
    # Initialize learnable weights, embeddings
    # (_W_emb is fed into the W_emb placeholder for the init ops)
    sess.run([init_op, W_init_op], feed_dict={W_emb: _W_emb})

    # Summary writer
    # Only created when TensorBoard event logging is enabled.
    # NOTE(review): the writer is not closed anywhere in the visible part of
    # this block -- confirm it is closed or flushed further down.
    if args["log_tfevents"]:
        writer = tf.summary.FileWriter(os.path.join(output_dir, "TF_logs"),
                                       sess.graph)

    for i in trange(args["train_iters"]):
        # Sample minibatch
        # args["mb"] distinct indices drawn without replacement from
        # range(N_train).
        idx = np.random.choice(N_train, args["mb"], replace=False)
        _x, _x_mask = preprocess.prepare_data(idx, data["train"],
                                              args["MAX_LEN"])
        _y = preprocess.prepare_labels(idx, labels["train"])

        # Run one train step
        feed_dict = {x: _x, x_mask: _x_mask, y: _y}
        if args["log_tfevents"]:
            # Fetch the merged summary in the same run so it can be logged
            # against step i.
            _, _sum = sess.run([train_step, merged_sum], feed_dict=feed_dict)
            writer.add_summary(_sum, i)
        else:
            sess.run(train_step, feed_dict=feed_dict)

        # Validate
        # Runs every args["validation_rate"] iterations, including iteration 0.
        if i % args["validation_rate"] == 0:
            val_accuracy = aux_functions.evaluate_accuracy(
                sess, accuracy, x, x_mask, y, data["valid"], labels["valid"],
                args["mb"], args["MAX_LEN"])
Ejemplo n.º 15
0
from encoderRNN import EncoderRNN
from attnDecoderRNN import AttnDecoderRNN
from training import train_iters
from stackRNN import SRNN_Softmax
# from evaluate import evaluate, evaluateRandomly

# run from root dir
# e.g., python seq2seq/main.py --data data/dataset_len100.tsv --length 100
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Seq2seq")
    # Both options are used unconditionally below, so they are required.
    # Omitting one used to surface later as a TypeError.
    parser.add_argument("--data", "-d", required=True, help="Path to datafile")
    parser.add_argument("--length", "-l", required=True, type=int,
                        help="Length of max sequence")
    # Parse the command line once and reuse the result.
    cli_args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_lang, output_lang, pairs = prepare_data(
        os.path.normpath(cli_args.data), "infix", 'postfix')
    # Allow sequences 10% longer than the stated maximum.
    max_length = round(int(cli_args.length) * 1.1)
    hidden_size = 256
    # encoder1 = EncoderRNN(input_lang.n_words, hidden_size, device).to(device)

    # Number of hidden units
    n_hidden = 256
    # Number of hidden layers
    n_layers = 1
    # Stack size
    stack_size = 104
    stack_dim = 1
    # Use the vocabulary size, not the language object itself.  This matches
    # how output_lang.n_words is passed to the decoder.
    vocab_size = input_lang.n_words
    encoder1 = SRNN_Softmax(n_hidden, vocab_size, vocab_size, n_layers,
                            stack_size, stack_dim)
    attn_decoder1 = AttnDecoderRNN(n_hidden, output_lang.n_words, max_length,
Ejemplo n.º 16
0
def is_eq(pred_indices, label_vec):
    """Return 1 if any index in ``pred_indices`` selects a 1 in ``label_vec``, else 0."""
    hit = any(label_vec[position] == 1 for position in pred_indices)
    return 1 if hit else 0

# Maps each attribute label id to its class index.
attr_lables = {
    '/m/02gy9n': 0, #Transparent
    '/m/05z87': 1, #Plastic
    '/m/0dnr7': 2, #(made of)Textile
    '/m/04lbp': 3, #(made of)Leather
    '/m/083vt': 4 #Wooden
}

img_id, id_bbox_dict, id_labels = preprocess.prepare_data(attr_lables)
feature_path = Path('features/')
feature_path_list = list(feature_path.glob('*.npy'))

# One index per feature file.  The previous range(0, len(...) - 1) stopped one
# short and silently left the last file out of both splits.
feature_path_names = list(range(len(feature_path_list)))
shuffle(feature_path_names)
# 80/20 split of the shuffled indices.
index = (8 * len(feature_path_names)) // 10

train_feature_path_idx = feature_path_names[:index]
val_feature_path_idx = feature_path_names[index:]

attr_classifier = torch.load('output/trained_model_15_0.0001_4999')
# Switch the loaded classifier to evaluation mode.
attr_classifier.eval()

total_accuracy = 0
# NOTE(review): nn.Softmax() without an explicit `dim` relies on PyTorch's
# deprecated implicit-dimension choice -- confirm the intended axis and pass it.
softmax = nn.Softmax()
Ejemplo n.º 17
0
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import input_output
import models
import preprocess

# Define constants
TEST_SIZE_SAMPLE = 0.25  # fraction of rows held out for testing
RANDOM_STATE_SPLIT = 40  # seed that makes the split reproducible

rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Separate features and output: column 0 is used as the target and the
# remaining columns as features.
# NOTE(review): no scaling happens here even though StandardScaler is imported
# -- confirm whether preprocess.prepare_data already scales.
trainData = (processedData[:, 1:])
predOutput = processedData[:, 0]

X_train, X_test, y_train, y_test = train_test_split(trainData, predOutput, test_size=TEST_SIZE_SAMPLE,
                                                    random_state=RANDOM_STATE_SPLIT)

# Try to load classifier from file
clf = input_output.load_classifier("titanicCLF.pkl")
# A falsy return value is treated as "no saved classifier available".
if not clf:
    # If no file is present, train the classifier using the best known parameters and save the classifier
    print("There is no saved classifier!")
    print("Training Model...")
    clf = svm.SVC(C=1, kernel="rbf", gamma=0.1)
Ejemplo n.º 18
0
def tuner(args, f, writer):
    """Pick the C with the best mean test recall, then hand over to ``trainer``.

    For every candidate in ``args.C`` an SVM pipeline is fitted on the leading
    rows of each sub-sample and evaluated on the trailing rows.  The candidate
    with the highest mean macro-averaged recall wins; the first one wins ties.
    The result is printed, appended to ``f`` and stored on ``args.currC``
    before ``trainer(args, f, writer)`` is called.

    Args:
        args: namespace providing ``path``, ``trading_days``, ``kernel``,
            ``degree``, ``C`` (iterable of values convertible to float),
            ``gamma``, ``coef0``, ``train_test_ratio`` and
            ``no_of_subsamples``.  ``currC`` is set on it as a side effect.
        f: object with a ``write`` method that receives the summary text.
        writer: passed through to ``trainer`` unchanged.

    Raises:
        ValueError: if ``args.C`` contains no candidates.
        AssertionError: if the kernel is "cobb-douglas" and gamma is 1.0.
    """
    path = args.path
    trading_days = args.trading_days
    kernel = args.kernel
    degree = args.degree
    candidate_cs = [float(value) for value in args.C]
    if not candidate_cs:
        raise ValueError("args.C must contain at least one candidate value")
    gamma = args.gamma

    # "poly" always runs with gamma fixed at 1.0, "rbf" with sklearn's "scale".
    if kernel == "poly":
        gamma = 1.0

    if kernel == "rbf":
        gamma = "scale"

    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio

    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"

    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        # Cobb-Douglas shares the "poly" code path; only its gamma differs.
        kernel = "poly"

    # Callable kernels handed to SVC; they read gamma/degree from this scope.
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T))**degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T)**degree)

    # load the dataset
    df = load_csv(path)
    data = prepare_data(data_f=df, horizon=trading_days, alpha=0.9)

    # "gain" is dropped before splitting; "pred" stays because it is the label.
    kept_columns = [x for x in data.columns if x not in ["gain"]]
    dataA = np.array_split(data[kept_columns], args.no_of_subsamples)

    # The input columns do not change between iterations, so compute them once.
    features = [x for x in data.columns if x not in ["gain", "pred"]]

    metrics = {}
    for c_value in candidate_cs:
        metrics[c_value] = {
            "accuracy": [],
            "precision": [],
            "recall": [],
            "f1": [],
        }

        for i in tqdm(range(args.no_of_subsamples)):
            X = dataA[i][features]
            y = dataA[i]["pred"]

            # Positional split: leading rows train, trailing rows test.
            split_at = int(train_test_ratio * len(X))
            X_train = rem_inf(X[:split_at])
            X_test = rem_inf(X[split_at:])
            y_train = y[:split_at]
            y_test = y[split_at:]

            svc_options = {
                "C": c_value,
                "class_weight": "balanced",
                "cache_size": 100000,
            }
            if kernel == "custom":
                svc_options["kernel"] = custom_kernel
            elif kernel == "poly":
                svc_options["kernel"] = poly_cobb_kernel
            else:
                svc_options.update(kernel=kernel,
                                   degree=degree,
                                   coef0=coef0,
                                   gamma=gamma)
            clf = make_pipeline(StandardScaler(), SVC(**svc_options))

            clf.fit(X_train, y_train)

            # Only the test report feeds the selection.  The training-set
            # report computed previously was never used.
            test_res = classification_report(y_test,
                                             clf.predict(X_test),
                                             output_dict=True)
            metrics[c_value]["accuracy"].append(test_res["accuracy"])
            metrics[c_value]["precision"].append(
                test_res["macro avg"]["precision"])
            metrics[c_value]["recall"].append(test_res["macro avg"]["recall"])
            metrics[c_value]["f1"].append(test_res["macro avg"]["f1-score"])

    # max() keeps the first candidate on ties, as the earlier strict ">"
    # comparison did.
    mean_recall = {
        c_value: mean(scores["recall"])
        for c_value, scores in metrics.items()
    }
    max_recall_C = max(mean_recall, key=mean_recall.get)
    best_recall = mean_recall[max_recall_C]

    print("Best Results:\n")
    print(max_recall_C)
    print("Recall: " + str(best_recall))

    f.write("\n\nBest Results of Tuner:\n")
    f.write("\nDegree: " + str(args.degree))
    f.write("\nC: " + str(max_recall_C))
    f.write("\ngamma: " + str(gamma))
    f.write("\nBest Recall Score: " + str(best_recall))

    # trainer reads the chosen value from args.currC.
    args.currC = max_recall_C
    trainer(args, f, writer)
Ejemplo n.º 19
0
import pandas as pd
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from NER import BiLSTM_CRF
import NER
import preprocess

# Hyper-parameters for the BiLSTM-CRF tagger.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4
EPOCHS = 30
# Fixed seed for torch's random number generator.
torch.manual_seed(1)
Z_train, Z_test, word2idx, tag2idx = preprocess.prepare_data("train.txt")

# Register the two boundary tags.  The second len() call runs after "<START>"
# has been inserted.
# NOTE(review): if prepare_data numbers the existing tags 0..n-1, then
# "<START>" (n - 2) reuses an existing tag index while "<STOP>" gets n --
# confirm this is intended.
tag2idx["<START>"] = len(tag2idx) - 2
tag2idx["<STOP>"] = len(tag2idx) - 1
n_tags = len(tag2idx)
# =============================================================================
# MODEL
# =============================================================================

model = BiLSTM_CRF(len(word2idx), tag2idx, EMBEDDING_DIM, HIDDEN_DIM, n_tags)
# Plain SGD with L2 weight decay.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# =============================================================================
# TRAINING
# =============================================================================
for epoch in range(EPOCHS):
Ejemplo n.º 20
0
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

import input_output
import models
import preprocess
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Define constants
TEST_SIZE_SAMPLE = 0.0001  # fraction of rows held out for testing
RANDOM_STATE_SPLIT = 38  # seed that makes the split reproducible
NUMBER_OF_ENTRIES = 300000  # not referenced in the visible part of the script

rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Separate features and output + scale data
# pandas has some weird column counting
# NOTE(review): StandardScaler and train_test_split are not among the imports
# visible above this script -- confirm they are imported elsewhere.
scaler = StandardScaler()
multiBinarizer = MultiLabelBinarizer()

# Columns 0 and 1 are the features.  The scaler is fitted on every row before
# the train/test split below.
trainData = scaler.fit_transform(processedData[:, 0:2])

# Convert output to binarized array
# Column 2 is reshaped to one label per row because MultiLabelBinarizer
# expects an iterable of label collections.
numbers = np.reshape((processedData[:, 2]), (len(processedData[:, 2]), 1))
predOutput = multiBinarizer.fit_transform(numbers)

X_train, X_test, y_train, y_test = train_test_split(trainData, predOutput, test_size=TEST_SIZE_SAMPLE,
                                                    random_state=RANDOM_STATE_SPLIT)
Ejemplo n.º 21
0
def main():
    """Grid-search the SVM regularisation parameter C over consecutive folds.

    The prepared frame is split into ``--folds`` consecutive chunks.  For every
    candidate C an SVM pipeline is fitted on the leading rows of each chunk and
    evaluated on the trailing rows.  Training and test classification reports
    are printed for every fit, then the mean accuracy, precision, recall and
    F1 per candidate, and finally the candidate with the highest mean
    macro-averaged recall (the first one wins ties).

    Raises:
        AssertionError: if the kernel is "cobb-douglas" and gamma is 1.0.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--path",
                        type=str,
                        default="dataset/NSEIdaily.csv",
                        help="path of csv file")

    parser.add_argument("--trading_days",
                        type=int,
                        default=1,
                        help="Number of trading days")

    parser.add_argument(
        "--kernel",
        type=str,
        default="rbf",
        help="the kernel for SVM",
        choices=["linear", "rbf", "poly", "custom", "cobb-douglas"],
    )

    parser.add_argument("--degree",
                        type=int,
                        default=3,
                        help="value of p in polynomial/custom kernel")

    parser.add_argument(
        "--C",
        nargs="+",
        default=[10**i for i in range(-100, 101)],
        help="the regularisation parameter for SVM",
    )

    parser.add_argument(
        "--gamma",
        type=float,
        default=1.0,
        help="the inner product coefficient in polynomial kernel",
    )

    parser.add_argument("--coef0",
                        type=float,
                        default=0.0,
                        help="coefficient for polynomial kernel")

    parser.add_argument("--train_test_ratio",
                        type=float,
                        default=0.75,
                        help="fraction of train samples")

    parser.add_argument("--folds",
                        type=int,
                        default=5,
                        help="k in k-fold cross validation")

    args = parser.parse_args()
    path = args.path
    trading_days = args.trading_days
    kernel = args.kernel
    degree = args.degree
    # Values given on the command line arrive as strings; normalise to float.
    c_grid = [float(value) for value in args.C]
    gamma = args.gamma

    # "poly" always runs with gamma fixed at 1.0, "rbf" with sklearn's "scale".
    if kernel == "poly":
        gamma = 1.0

    if kernel == "rbf":
        gamma = "scale"

    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio
    folds = args.folds

    print("Details: ")
    print("Extracting data from: " + str(path))
    print("Trading Days: " + str(trading_days))

    if kernel == "poly":
        assert (
            gamma == 1.0
        ), "Polynomial should have gamma=1.0, otherwise it is Cobb-Douglas Kernel"

        print("Kernel: polynomial")
        print("Degree: " + str(degree))

    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print("Gamma: " + str(gamma))
        print("Degree: " + str(degree))
        # Cobb-Douglas shares the "poly" code path; only its gamma differs.
        kernel = "poly"

    elif kernel == "custom":
        print("Kernel: custom")
        print("Degree: " + str(degree))

    else:
        print("Kernel: " + str(kernel))

    # define the custom kernels
    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T))**degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T)**degree)

    # load the dataset
    df = load_csv(path)
    data = prepare_data(
        data_f=df,
        horizon=trading_days,
        alpha=0.9,
    )

    # "gain" is dropped before splitting; "pred" stays because it is the label.
    kept_columns = [x for x in data.columns if x not in ["gain"]]
    dataA = np.array_split(data[kept_columns], folds)

    # The input columns do not change between iterations, so compute them once.
    features = [x for x in data.columns if x not in ["gain", "pred"]]

    print("\n")
    metrics = {}
    for c_value in c_grid:
        metrics[c_value] = {
            "accuracy": [],
            "precision": [],
            "recall": [],
            "f1": [],
        }
        # dataA holds exactly `folds` chunks.  The loop previously ranged over
        # `no_of_subsamples`, which is never defined in this function.
        for i in tqdm(range(folds)):
            X = dataA[i][features]
            y = dataA[i]["pred"]
            print("Performing Grid (Time Series) Search on:\n")
            print("C: " + str(c_value))
            print("gamma: " + str(gamma))
            print("Fold #" + str(i + 1))
            print("\n")

            # Positional split: leading rows train, trailing rows test.
            split_at = int(train_test_ratio * len(X))
            X_train, X_test = X[:split_at], X[split_at:]
            y_train, y_test = y[:split_at], y[split_at:]

            if kernel == "custom":
                clf = make_pipeline(
                    StandardScaler(),
                    SVC(
                        kernel=custom_kernel,
                        C=c_value,
                        class_weight="balanced",
                        cache_size=100000,
                    ),
                )

            elif kernel == "poly":
                clf = make_pipeline(
                    StandardScaler(),
                    SVC(
                        kernel=poly_cobb_kernel,
                        C=c_value,
                        class_weight="balanced",
                        cache_size=100000,
                    ),
                )

            else:
                clf = make_pipeline(
                    StandardScaler(),
                    SVC(
                        kernel=kernel,
                        degree=degree,
                        C=c_value,
                        coef0=coef0,
                        gamma=gamma,
                        class_weight="balanced",
                        cache_size=100000,
                    ),
                )

            clf.fit(X_train, y_train)

            print("Training Report:")
            y_train_pred = clf.predict(X_train)
            print(classification_report(y_train, y_train_pred))
            print("\n")
            print("Test Report:")
            y_test_pred = clf.predict(X_test)
            test_res = classification_report(y_test,
                                             y_test_pred,
                                             output_dict=True)
            print(classification_report(y_test, y_test_pred))
            metrics[c_value]["accuracy"].append(test_res["accuracy"])
            metrics[c_value]["precision"].append(
                test_res["macro avg"]["precision"])
            metrics[c_value]["recall"].append(test_res["macro avg"]["recall"])
            metrics[c_value]["f1"].append(test_res["macro avg"]["f1-score"])

    print(metrics)
    print("\n")

    # Average every metric once per candidate, then reuse the numbers for both
    # the per-candidate printout and the final selection.
    averages = {
        c_value: {name: mean(values) for name, values in scores.items()}
        for c_value, scores in metrics.items()
    }

    max_recall_C = list(averages.keys())[0]
    for c_value, summary in averages.items():
        print("For regularisation parameter: " + str(c_value))
        print("Accuracy: " + str(summary["accuracy"]))
        print("Precision: " + str(summary["precision"]))
        print("Recall: " + str(summary["recall"]))
        print("F1: " + str(summary["f1"]))
        # Strict ">" keeps the first candidate on ties.
        if summary["recall"] > averages[max_recall_C]["recall"]:
            max_recall_C = c_value
        print("\n")

    best = averages[max_recall_C]
    print("Best Results:\n")
    print(max_recall_C)
    # All four figures come from the winning candidate.  Accuracy was
    # previously read from whichever candidate the loop visited last.
    print("Accuracy: " + str(best["accuracy"]))
    print("Precision: " + str(best["precision"]))
    print("Recall: " + str(best["recall"]))
    print("F1: " + str(best["f1"]))
Ejemplo n.º 22
0
        # test_documents = []
        # for document in documents1:
        #     if document.name in pmids:
        #         test_documents.append(document)
        dev_documents = []
        for document in documents1:
            if document.name in pmids:
                dev_documents.append(document)
        test_documents = []

        abbr_dict = load_abbr(config['ncbi_abbr'])
        logging.info("loading dictionary ... ")
        dictionary = load_ctd(config['norm_dict'])

        logging.info("generate data points")
        train_datapoints = prepare_data(train_documents, abbr_dict, dictionary)
        dev_datapoints = prepare_data_1(
            dev_documents, abbr_dict, dictionary
        )  # we use dev_datapoints and test_datapoints only for build alphabet
        if len(test_documents) != 0:
            test_datapoints = prepare_data_1(test_documents, abbr_dict,
                                             dictionary)
        if opt.pretraining:
            dict_datapoints = prepare_dict_data(dictionary)

        logging.info("build alphabet ...")
        enc_word_alphabet = Alphabet('enc_word')
        if opt.use_char:
            enc_char_alphabet = Alphabet('enc_char')
        else:
            enc_char_alphabet = None
Ejemplo n.º 23
0
def trainer(args, f, writer):
    """Train one SVM per data subsample and report the averaged metrics.

    The CSV at ``args.path`` is loaded with ``load_csv``, prepared with
    ``prepare_data`` and split into ``args.no_of_subsamples`` consecutive
    chunks.  For every chunk a ``StandardScaler`` + ``SVC`` pipeline is fitted
    on the leading ``args.train_test_ratio`` fraction of the rows and scored on
    the remaining rows with ``compute_acc``.  The per-chunk scores are averaged
    and emitted three ways: printed, written to ``f`` and appended to
    ``writer`` as one row.

    Kernel handling:
        * ``"poly"``: ``args.gamma`` is replaced by 1.0 and the callable
          ``gamma * (X . Y^T) ** degree`` is used.
        * ``"cobb-douglas"``: same callable with the user's ``gamma`` (which
          must differ from 1.0).
        * ``"custom"``: the callable ``1 / (1 + (X . Y^T) ** degree)``.
        * ``"rbf"``: ``args.gamma`` is replaced by ``"scale"``.
        * anything else is handed to ``SVC`` as-is together with ``degree``,
          ``coef0`` and ``gamma``.

    Args:
        args: Object exposing ``path``, ``trading_days``, ``no_of_subsamples``,
            ``kernel``, ``degree``, ``currC``, ``gamma``, ``coef0`` and
            ``train_test_ratio``.
        f: Object with a ``write`` method; receives the averaged results as
            text.
        writer: Object with a ``writerow`` method; receives the row
            ``[degree, gamma, C, train recall, test recall, train precision,
            test precision, train F1, test F1, train accuracy, test accuracy]``.

    Raises:
        AssertionError: If ``args.kernel`` is ``"cobb-douglas"`` and
            ``args.gamma`` equals 1.0.
    """
    path = args.path
    trading_days = args.trading_days
    no_of_subsamples = args.no_of_subsamples
    kernel = args.kernel
    degree = args.degree
    C = args.currC
    gamma = args.gamma
    coef0 = args.coef0
    train_test_ratio = args.train_test_ratio

    # Some kernels pin gamma regardless of what was passed in.
    if kernel == "poly":
        gamma = 1.0
    elif kernel == "rbf":
        gamma = "scale"

    print("Details: ")
    print("Extracting data from: " + str(path))
    print("Trading Days: " + str(trading_days))
    print("Number of Subsamples: " + str(no_of_subsamples))

    if kernel == "poly":
        # gamma was forced to 1.0 above, which is what separates the plain
        # polynomial kernel from the Cobb-Douglas one.
        print("Kernel: polynomial")
        print("Degree: " + str(degree))

    elif kernel == "cobb-douglas":
        assert (
            gamma != 1.0
        ), "Cobb-Douglas should have gamma!=1.0, otherwise it is Polynomial Kernel"
        print("Kernel: cobb-douglas")
        print("Gamma: " + str(gamma))
        print("Degree: " + str(degree))
        # From here on Cobb-Douglas shares the polynomial code path.
        kernel = "poly"

    elif kernel == "custom":
        print("Kernel: custom")
        print("Degree: " + str(degree))

    else:
        print("Kernel: " + str(kernel))

    print("Regularisation Parameter, C: " + str(C))

    # define the custom kernels

    def poly_cobb_kernel(X, Y):
        return gamma * (np.dot(X, Y.T)) ** degree

    def custom_kernel(X, Y):
        return 1 / (1 + np.dot(X, Y.T) ** degree)

    # The kernel-specific SVC arguments do not change between subsamples, so
    # they are resolved once.  Callable kernels take no degree/coef0/gamma.
    if kernel == "custom":
        svc_params = {"kernel": custom_kernel}
    elif kernel == "poly":
        svc_params = {"kernel": poly_cobb_kernel}
    else:
        svc_params = {
            "kernel": kernel,
            "degree": degree,
            "coef0": coef0,
            "gamma": gamma,
        }

    df = load_csv(path)
    data = prepare_data(data_f=df, horizon=trading_days, alpha=0.9)

    # "gain" is dropped before splitting; "pred" is the target and is kept in
    # the chunks but excluded from the model inputs.
    non_gain_columns = [x for x in data.columns if x not in ["gain"]]
    input_columns = [x for x in data.columns if x not in ["gain", "pred"]]
    subsamples = np.array_split(data[non_gain_columns], no_of_subsamples)

    # Running sums of (accuracy, precision, recall, F1) per split.
    totals = {"training": [0, 0, 0, 0], "test": [0, 0, 0, 0]}

    t0 = time.time()

    for subsample in tqdm(subsamples):
        X = subsample[input_columns]
        y = subsample["pred"]

        # Chronological split: leading rows train, trailing rows test.
        split_at = int(train_test_ratio * len(X))

        # NOTE(review): rem_inf is applied to X only; if it drops rows rather
        # than replacing values, X and y would no longer line up -- confirm.
        X_train = rem_inf(X[:split_at])
        y_train = y[:split_at]

        X_test = rem_inf(X[split_at:])
        y_test = y[split_at:]

        clf = make_pipeline(
            StandardScaler(),
            SVC(
                C=C,
                class_weight="balanced",
                cache_size=100000,
                **svc_params
            ),
        )

        clf.fit(X_train, y_train)

        metrics = compute_acc(clf, X_train, y_train, X_test, y_test)

        for split_name, sums in totals.items():
            for k in range(4):
                sums[k] += metrics[split_name][k]

    print("\nTime taken: " + str((time.time() - t0) / 60) + " minutes")

    train_acc, train_prec, train_recall, train_f1 = (
        total / no_of_subsamples for total in totals["training"]
    )
    test_acc, test_prec, test_recall, test_f1 = (
        total / no_of_subsamples for total in totals["test"]
    )

    # Format every result line once and reuse it for stdout and the file.
    train_lines = [
        "Average Training Accuracy:\t" + str(train_acc),
        "Average Training Precision:\t" + str(train_prec),
        "Average Training Recall:\t" + str(train_recall),
        "Average Training F1:\t\t" + str(train_f1),
    ]
    test_lines = [
        "Average Test Accuracy:\t\t" + str(test_acc),
        "Average Test Precision:\t\t" + str(test_prec),
        "Average Test Recall:\t\t" + str(test_recall),
        "Average Test F1:\t\t" + str(test_f1),
    ]

    print("Average Results")
    for line in train_lines:
        print(line)

    print("\n")

    for line in test_lines:
        print(line)

    f.write("\nAverage Results after Training")
    for line in train_lines:
        f.write("\n" + line)

    f.write("\n")

    for line in test_lines:
        f.write("\n" + line)

    writer.writerow(
        [
            degree,
            gamma,
            C,
            train_recall,
            test_recall,
            train_prec,
            test_prec,
            train_f1,
            test_f1,
            train_acc,
            test_acc,
        ]
    )
Ejemplo n.º 24
0
# Log file path stamped with the current date and time (logs/YYYYmmdd_HHMM.log),
# computed when the module is loaded.
logfile = datetime.datetime.now().strftime("logs/%Y%m%d_%H%M") + ".log"

if __name__ == "__main__":
    n_jobs = 4
    use_cache = False
    logger = False

    if not use_cache:
        # Build the data set with prepare_data and save it under a file name
        # that encodes part, n_quantiles and the window sizes.
        n_drivers = 10000
        n_jobs = 4
        windows = [1, 15, 30, 60]
        part = 4
        n_quantiles = 15
        size = None
        # The leading "" makes the join produce "w1w15w30w60" for these windows.
        fname = "data/processed_part%i_q%s_%s.csv" % (part, n_quantiles, "w".join([str(w) for w in [""] + windows]))
        data = prepare_data(n_drivers, windows, n_quantiles, part, size, n_jobs)
        data.to_csv(fname)
    else:
        # use cache
        # NOTE(review): this reads data/processed.csv, not the fname written by
        # the branch above -- confirm the cache file is produced elsewhere.
        t = time()
        print "Loading cache...",
        data = pd.DataFrame.from_csv("data/processed.csv")
        print "DONE! %.2fm" % ((time() - t) / 60.0)

    # NOTE(review): the first term multiplies every value by n_jobs, but in the
    # second term "* n_jobs" sits inside np.array(...), where (range being a
    # list in Python 2) it repeats the list n_jobs times instead of scaling the
    # values -- confirm which was intended.
    eta_iteration = (np.array([2, 3, 4, 5, 10, 50, 100]) * n_jobs).tolist() + (
        np.array(range(200, 3000, 100) * n_jobs).tolist()
    )
    probas = []
    # Restart the timer for the prediction phase.
    t = time()
    print "Predicting... estimated time:",
    if n_jobs > 1:
Ejemplo n.º 25
0
if __name__ == '__main__':
    n_jobs = 4
    use_cache = False
    logger = False

    if not use_cache:
        # Build the data set with prepare_data and save it under a file name
        # that encodes part, n_quantiles and the window sizes.
        n_drivers = 10000
        n_jobs = 4
        windows = [1, 15, 30, 60]
        part = 4
        n_quantiles = 15
        size = None
        # The leading '' makes the join produce "w1w15w30w60" for these windows.
        fname = "data/processed_part%i_q%s_%s.csv" % (
            part, n_quantiles, 'w'.join([str(w) for w in [''] + windows]))
        data = prepare_data(n_drivers, windows, n_quantiles, part, size,
                            n_jobs)
        data.to_csv(fname)
    else:
        # use cache
        # NOTE(review): this reads data/processed.csv, not the fname written by
        # the branch above -- confirm the cache file is produced elsewhere.
        t = time()
        print("Loading cache...", end=' ')
        # DataFrame.from_csv was removed in pandas 1.0.  read_csv with
        # index_col=0 and parse_dates=True reproduces from_csv's defaults
        # (first column as index, index parsed as dates).
        data = pd.read_csv("data/processed.csv", index_col=0,
                           parse_dates=True)
        print("DONE! %.2fm" % ((time() - t) / 60.))

    # NOTE(review): the first term multiplies every value by n_jobs, but in the
    # second term "* n_jobs" sits inside np.array(...), so it repeats the list
    # n_jobs times instead of scaling the values -- confirm which was intended.
    eta_iteration = (np.array([2, 3, 4, 5, 10, 50, 100]) * n_jobs).tolist() + (
        np.array(list(range(200, 3000, 100)) * n_jobs).tolist())
    probas = []
    # Restart the timer for the prediction phase.
    t = time()
    print("Predicting... estimated time:", end=' ')
    if n_jobs > 1:
        # initialize logger and pool args
Ejemplo n.º 26
0
    # Command-line interface: five positional arguments, all parsed as strings.
    # The help texts for num_iterations and learning_rate previously repeated
    # the descriptions of directory_path and fruit.
    parser = argparse.ArgumentParser(description="Trains image data.")
    parser.add_argument("num_iterations",
                        help="number of training iterations")
    parser.add_argument("learning_rate",
                        help="learning rate used for training")
    parser.add_argument("directory_path",
                        help="path to directory that contains data")
    parser.add_argument("fruit",
                        help="name of the fruit which you wish to classify")
    parser.add_argument("image_path",
                        help="path to image that you'd like to classify")
    args = parser.parse_args()

    # prepares the data
    x_train, x_test, y_train, y_test = preprocess.prepare_data(
        args.directory_path, args.fruit)

    # trains the model; the numeric arguments arrive as strings and are
    # converted here
    model = train.model(x_train,
                        y_train,
                        x_test,
                        y_test,
                        num_iterations=int(args.num_iterations),
                        learning_rate=float(args.learning_rate),
                        print_cost=False)

    # loads and reads the image you want to classify
    im_pix = imageio.imread(args.image_path)
    # scale the pixel values by 1/255
    im_pix = im_pix / 255.
    im_pix = resize(im_pix, (100, 100), anti_aliasing=True)
    # flatten into a single column of 100 * 100 * 3 values
    # NOTE(review): this only works if the resized image has exactly three
    # channels -- confirm grayscale/RGBA inputs are not expected.
    im_pix = im_pix.reshape(100 * 100 * 3, 1)