def model_checker(df, vectorizer, classifier, sampling_method):
    """Train and evaluate a vectorizer + classifier pipeline on the tweet data."""
    print(classifier)
    print('\n')
    trainTweet, testTweet, trainLabel, testLabel = train_test_split(df, sampling_method)
    pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])
    t0 = time()
    sentiment_fit = pipeline.fit(trainTweet, trainLabel)
    y_pred = sentiment_fit.predict(testTweet)
    train_test_time = time() - t0
    accuracy = accuracy_score(testLabel, y_pred)
    # sklearn expects (y_true, y_pred); keep the order consistent with the other metrics
    confusion_result = confusion_matrix(testLabel, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print('-' * 80)
    print("Confusion Matrix\n")
    print(pd.DataFrame(confusion_result))
    print('-' * 80)
    print("Classification Report\n")
    print(classification_report(testLabel, y_pred))
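# A minimal usage sketch for model_checker, assuming `df` is the tweet DataFrame
# expected by the project's custom train_test_split. The vectorizer, the classifier,
# and the 'none' sampling value are illustrative assumptions, not taken from the
# original code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model_checker(df, TfidfVectorizer(), LogisticRegression(max_iter=1000), 'none')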
def main(config_filename):
    logger.debug("Starting execution.")
    parameters = Parameters(config_filename, training_mode=True)
    if parameters.preprocessed_data:
        if not isfile(parameters.excel_file) and not isfile(parameters.preprocessed_data_file):
            logger.error("Please provide a valid Excel file or a valid preprocessed data file.")
            quit()
        if not isfile(parameters.preprocessed_data_file) and isfile(parameters.excel_file):
            logger.info("Loading Excel file.")
            data_frame = read_excel(parameters.excel_file)
            logger.info("Creating documents.")
            docs = data_frame_to_document_list(data_frame)
            logger.info("Storing generated documents.")
            pickle_manager.dump_documents(docs, parameters.preprocessed_data_file)
        logger.info("Preprocessing documents.")
        preprocessor = Preprocessor(stanfordnlp_language_package=parameters.stanfordnlp_language_package,
                                    stanfordnlp_use_gpu=parameters.stanfordnlp_use_gpu,
                                    stanfordnlp_resources_dir=parameters.stanfordnlp_resources_dir,
                                    training_mode=parameters.training_mode)
        preprocessor.preprocess(text_field=parameters.excel_column_with_text_data,
                                preprocessed_data_file=parameters.preprocessed_data_file)
        logger.info("Checking generated data.")
        pickle_manager.check_data(parameters.preprocessed_data_file)
    else:
        if not isfile(parameters.preprocessed_data_file):
            logger.error("The indicated preprocessed data file does not exist.")
            quit()
    logger.info("Extracting features.")
    feature_extractor = FeatureExtractor(nltk_stop_words_package=parameters.nltk_stop_words_package,
                                         vectorizer_name=parameters.vectorizer,
                                         training_mode=parameters.training_mode,
                                         use_lda=parameters.use_lda,
                                         document_adjustment_code=parameters.document_adjustment_code,
                                         remove_adjectives=parameters.remove_adjectives,
                                         synonyms_file=parameters.synonyms_file,
                                         features_file=parameters.features_file)
    X, y, _lemmas = feature_extractor.generate_X_y(class_field=parameters.excel_column_with_classification_data,
                                                   preprocessed_data_file=parameters.preprocessed_data_file)
    logger.info("Splitting dataset into training and test subsets.")
    train_test_split(y, parameters.test_subset_size,
                     parameters.preprocessed_data_file, parameters.force_subsets_regeneration)
    logger.info("Running classifiers.")
    p = classifiers.Pipeline(parameters.classifiers, parameters.cross_validate)
    metadata = pickle_manager.get_docs_metadata(parameters.preprocessed_data_file)
    training_set_indexes = metadata['training_set_indexes'].tolist()
    test_set_indexes = metadata['test_set_indexes'].tolist()
    assert len(training_set_indexes) == len(set(training_set_indexes))
    assert len(test_set_indexes) == len(set(test_set_indexes))
    # Drop the documents flagged for removal from whichever subset contains them
    for elem in feature_extractor.to_remove:
        try:
            training_set_indexes.remove(elem)
        except ValueError:
            test_set_indexes.remove(elem)
    logger.info("Accuracies:")
    p.start(X, y, parameters.number_of_jobs, parameters.set_num_accepted_probs,
            training_set_indexes, test_set_indexes, parameters.resampling)
    logger.debug("Execution completed.")
def Word2Vec_Model(df, classifier, sampling_method):
    print(classifier)
    print('\n')
    GloveModel = load_glove_model("glove.twitter.27B.100d.txt")
    trainTweet, testTweet, trainLabel, testLabel = train_test_split(df, sampling_method)
    pipeline = Pipeline([('classifier', classifier)])
    global count_total, count_in, count_out
    global out_words_list
    count_total, count_in, count_out = 0, 0, 0
    out_words_list = []
    # The vector size must match the dimension of the loaded GloVe file
    # (100 for glove.twitter.27B.100d.txt, 200 for glove.twitter.27B.200d.txt).
    trainVec = get_tweet_vectors(trainTweet, GloveModel, 100)
    testVec = get_tweet_vectors(testTweet, GloveModel, 100)
    print("Glove word embedding statistic\n",
          "count_total: %d/" % count_total,
          "count_in: %d/" % count_in,
          "count_out: %d/" % count_out)
    print("Number of unique words without embedding: %d" % len(set(out_words_list)))
    print("Words without embedding: \n", set(out_words_list))
    t0 = time()
    pipeline.fit(trainVec, trainLabel)
    y_pred = pipeline.predict(testVec)
    train_test_time = time() - t0
    accuracy = accuracy_score(testLabel, y_pred)
    # sklearn expects (y_true, y_pred); keep the order consistent with the other metrics
    confusion_result = confusion_matrix(testLabel, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print('-' * 80)
    print("Confusion Matrix\n")
    print(pd.DataFrame(confusion_result))
    print('-' * 80)
    print("Classification Report\n")
    print(classification_report(testLabel, y_pred))
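# A hypothetical sketch of what a helper like get_tweet_vectors could do for a single
# tweet: average the GloVe vectors of its tokens while updating the in/out-of-vocabulary
# counters used above. The actual project implementation is not shown here; the function
# name, the dict-like glove_model interface, and the token list are assumptions.
import numpy as np

def average_tweet_vector(tokens, glove_model, dim):
    global count_total, count_in, count_out, out_words_list
    vec = np.zeros(dim)
    hits = 0
    for word in tokens:
        count_total += 1
        if word in glove_model:
            vec += glove_model[word]
            count_in += 1
            hits += 1
        else:
            count_out += 1
            out_words_list.append(word)
    # Return the mean of the embedded tokens, or a zero vector if none were found
    return vec / hits if hits else vec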
def main(args):
    # Make a directory to save models
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Preprocess the RRM data
    vocab, df_aligned = preprocess(preprocessed=args.preprocessed,
                                   RRM_path=args.aligned_RRM_path,
                                   output_path=args.processed_RRM_path,
                                   sep=args.sep)
    df_aligned = train_test_split(df_aligned)
    with open(os.path.join(args.model_path, 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)

    # Prepare the training and validation sets
    train_index = pd.read_csv('../data/train_index.csv', header=None).iloc[:, 0]
    train_loader = RRM_Sequence(df_aligned.loc[train_index, :], vocab)
    train_loader = DataLoader(train_loader, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
    val_index = pd.read_csv('../data/val_index.csv', header=None).iloc[:, 0]
    val_loader = RRM_Sequence(df_aligned.loc[val_index, :], vocab)
    val_loader = DataLoader(val_loader, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)

    # Define the models
    encoder = ResNetEncoder(df_aligned.shape[1], len(vocab), args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    # Use CUDA if available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Define the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    val_loss_history = []
    stop = False  # early-stopping flag; initialized so the end-of-epoch check is always defined
    for epoch in range(args.num_epochs):
        for batch_idx, (names, rrms_aligned, rrms_unaligned, lengths) in enumerate(train_loader):
            rrms_aligned = to_var(rrms_aligned)
            rrms_unaligned = to_var(rrms_unaligned)
            targets = pack_padded_sequence(rrms_unaligned, lengths, batch_first=True)[0]

            # Forward, backward, and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(rrms_aligned)
            outputs = decoder(features, rrms_unaligned, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if (batch_idx + 1) % args.log_step == 0:
                val_loss = validate(val_loader, encoder, decoder, criterion)
                val_loss_history.append(val_loss)
                # loss.data[0] follows the pre-0.4 PyTorch API used throughout this script
                print('Epoch [%d/%d], Step [%d/%d], Training Loss: %.4f, Validation loss: %.4f'
                      % (epoch + 1, args.num_epochs, batch_idx + 1, total_step, loss.data[0], val_loss))
                stop = early_stop(val_loss_history)
                if stop:
                    print('=== Early stopping === Validation loss not improving significantly ===')
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path,
                                            'decoder-anneal%s-%dcolumns-%d-%d.pkl'
                                            % (args.learning_rate_annealing, df_aligned.shape[1],
                                               epoch + 1, batch_idx + 1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path,
                                            'encoder-anneal%s-%dcolumns-%d-%d.pkl'
                                            % (args.learning_rate_annealing, df_aligned.shape[1],
                                               epoch + 1, batch_idx + 1)))
                    break

            # Save the models
            if (batch_idx + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-anneal%s-%dcolumns-%d-%d.pkl'
                                        % (args.learning_rate_annealing, df_aligned.shape[1],
                                           epoch + 1, batch_idx + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-anneal%s-%dcolumns-%d-%d.pkl'
                                        % (args.learning_rate_annealing, df_aligned.shape[1],
                                           epoch + 1, batch_idx + 1)))

        # Decay the learning rate if specified
        if args.learning_rate_annealing:
            adjust_learning_rate(optimizer, epoch + 1)
        if stop:
            break
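# A hypothetical patience-style check illustrating what a function like early_stop could
# compute from val_loss_history: stop when the most recent validation losses have not
# improved on the best earlier loss. This is an illustrative assumption; the project's
# actual early_stop implementation is not shown in this excerpt.
def early_stop_sketch(val_loss_history, patience=5, min_delta=1e-4):
    if len(val_loss_history) <= patience:
        return False
    best_earlier = min(val_loss_history[:-patience])
    recent_best = min(val_loss_history[-patience:])
    # Stop if the recent best loss is not at least min_delta better than the earlier best
    return recent_best > best_earlier - min_delta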
import time

cross_corrs = np.zeros(st.nclusters)
for i, cl in enumerate(st.clusters):
    sta = data['stas'][i]
    spikes = st.binnedspiketimes(i)

    start = time.time()

    # Calculate the contrast for each cell's receptive field
    stimulus[-1, :] = st.contrast_signal_cell(i)
    sp_tr, sp_te, stim_tr, stim_te = train_test_split(spikes, stimulus,
                                                      test_size=val_split_size,
                                                      split_pos=val_split_pos)
    res = gqm.minimize_loglikelihood(np.zeros((stimdim, fl)),
                                     np.zeros((stimdim, fl, fl)), 0,
                                     stim_tr, st.frame_duration, sp_tr,
                                     minimize_disp=True, method='BFGS')
    elapsed = time.time() - start
    print(f'Time elapsed: {elapsed/60:6.1f} mins for cell {i}')
    k_out, Q_out, mu_out = gqm.splitpars(res.x)
    kall[i, ...] = k_out
    Qall[i, ...] = Q_out
def testImages():
    import sys
    import numpy
    sys.path.append("./preprocess/general_preprocessing")
    from train_test_split import train_test_split
    # Use a raw string so the Windows-style backslashes are not treated as escape sequences
    train_test_split(r".\datasets\several_faces_dataset")
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from train_test_split import train_test_split
from LinearRegression import LinearRegression

boston = datasets.load_boston()
X = boston.data
y = boston.target

X = X[y < 50.0]
y = y[y < 50.0]

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

reg = LinearRegression()
reg.fit_normal(X_train, y_train)

print(reg.interception_)
print(reg.coef_)
print(reg.score(X_test, y_test))
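# A minimal sketch of the normal-equation solution that a method such as fit_normal
# typically computes: theta = (X_b^T X_b)^{-1} X_b^T y, with a prepended bias column.
# This illustrates the technique only; it is not the LinearRegression class used above,
# and the helper name is hypothetical.
import numpy as np

def fit_normal_sketch(X_train, y_train):
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])  # add bias column
    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
    intercept, coef = theta[0], theta[1:]
    return intercept, coef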
from resizing_images import resize_images
from train_test_split import train_test_split
from generating_facemarks import collect_facemarks
from read_write import write_csv
from read_write import write_npy
import configparser

cf = configparser.ConfigParser()
cf.read("./config.ini")

if __name__ == "__main__":
    # Split the train and test data // trData, clPrtestData, openPrTestData
    print("Splitting the data...")
    dataPaths = []
    dataPaths = train_test_split(cf.get("preprocess", "preprocessDataset"))
    print("DONE")

    # Resize the data
    print("Resizing the images...")
    print("!ALERT!: resizing in place!!!")
    dimensionsXY = cf.getint("preprocess", "forcedImageSizeXY")
    # Uncomment if resizing is needed
    # resize_images(dataPaths, dimensionsXY, dimensionsXY)

    # Augment with facemarks
    # NOTE: images for which the model fails to produce facemarks are removed
    print("Adding facemarks to the dataset")
    collect_facemarks(dataPaths)
    print("DONE")

    print("Saving the data...")
kall = np.zeros((st.nclusters, 2, xval_splits, st.filter_length))
muall = np.zeros((st.nclusters, xval_splits))
frs = np.zeros((all_spikes.shape[0], int(all_spikes.shape[-1]/xval_splits)))
cross_corrs = np.zeros((st.nclusters, xval_splits))

t = np.linspace(0, st.filter_length*st.frame_duration*1000, st.filter_length)
stimulus = st.bgsteps
plotlabels = ['Motion X', 'Motion Y']

for i, cluster in enumerate(st.clusters):
    for xvi in range(xval_splits):
        sp_tr, sp_te, stim_tr, stim_te = train_test_split(all_spikes[i], stimulus,
                                                          test_size=xval_fraction,
                                                          split_pos=xval_fraction*xvi)
        res = glm.minimize_loglhd(np.zeros((2, st.filter_length)), 0,
                                  stim_tr, st.frame_duration, sp_tr,
                                  usegrad=True, method='BFGS')
        if not res['success']:
            print(i, 'did not complete successfully.')
        # kall[i, ...] = res['x'][:-1]
        # muall[i] = res['x'][-1]
        kall[i, :, xvi, ...], muall[i, xvi] = glm.splitpars(res['x'])
        frs[i, :] = glm.glm_neuron(kall[i, :, xvi, ...],
import matplotlib.pyplot as plt
from sklearn import datasets

from train_test_split import train_test_split
from SimpleLinearRegression import SimpleLinearRegression2
import metrics as me
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import sklearn.metrics as skm

boston = datasets.load_boston()
# print(boston.DESCR)
x = boston.data[:, 5]
y = boston.target

x = x[y < 50.0]
y = y[y < 50.0]

x_train, x_test, y_train, y_test = train_test_split(x, y, seed=666)
# print(x_train.shape)
# print(x_test.shape)

reg = SimpleLinearRegression2()
reg.fit(x_train, y_train)
# print(reg.a_)
# print(reg.b_)

y_predict = reg.predict(x_test)
# print(y_predict)

"""
# MSE (mean squared error)
mse_test = np.sum((y_predict - y_test) ** 2) / len(y_test)
# mse_test = me.mean_squared_error(y_test, y_predict)
print(mse_test)
from plot import plot_results, get_z
from prepare_data import prepare_data
from train_test_split import train_test_split
import parameters
# NOTE: the original excerpt does not show where AdaBoostClassifier and f1_score come
# from; the scikit-learn imports below are an assumption added to make the snippet run.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score

dataset_names = ['chips', 'geyser']
dataset_name = dataset_names[0]
filename = 'data/{}.csv'.format(dataset_name)
dataset, features, labels = prepare_data(filename, normalization=False)

train_test_ratio = 0.8
train_set, train_features, train_labels, test_set, test_features, test_labels = \
    train_test_split(dataset, train_test_ratio)

results = []
for n_estimators in parameters.n_estimators:
    for learning_rate in parameters.learning_rate:
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        model.fit(train_features, train_labels)
        predicted_labels = model.predict(test_features)
        f_score = f1_score(test_labels, predicted_labels, average='binary', pos_label='P')
        results.append({'f_score': f_score,
                        'n_estimators': n_estimators,
                        'learning_rate': learning_rate})
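# A small follow-up sketch: once the grid loop above has filled `results`, the best
# (n_estimators, learning_rate) pair can be picked by F-score. This is purely
# illustrative; the original excerpt does not show how `results` is consumed.
best = max(results, key=lambda r: r['f_score'])
print('best F1 = {f_score:.3f} with n_estimators={n_estimators}, '
      'learning_rate={learning_rate}'.format(**best))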
# Define the test location, increment, and depth interval
test_loc = [400, 700]  # candidate locations: 400, 700 or 550, 900
test_inc = 100
dz = 0.1524

plot_well_feature(x_trainwell, test_loc, test_inc, dz)
plot_well_target(y_trainwell, test_loc, test_inc, dz)

#%%
from train_test_split import train_test_split

X_train, Y_train, X_test, Y_test = train_test_split(x_trainwell, y_trainwell, test_loc, test_inc)

pd_data = pd.DataFrame(data=X_train, columns=features)
g = sns.pairplot(pd_data, corner=True, markers="o",
                 plot_kws=dict(s=5, edgecolor="b", linewidth=1))
g.fig.set_figwidth(8)
g.fig.set_figheight(8)
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

#%%
# Standardize the feature matrix for the training data
scaler = StandardScaler()
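# A hedged continuation sketch: the usual pattern is to fit the scaler on the training
# features only and apply the same transform to the test features. The variable names
# follow the split above; the original script's actual next steps are not shown here.
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)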