def main():
    """Run the exploratory analysis of the red-wine quality data.

    Reads ``data/winequality-red.csv`` and then, in order:

    * calls ``plot_histogram`` for every attribute combined with every
      entry of ``BIN_FUNCTIONS``;
    * builds a ``DataFrame`` from the records and passes it to
      ``plot_scatter_matrix`` and ``plot_parallel_coordinates``;
    * calls ``plot_pca_projection`` twice, without and with
      ``normalized=True``, followed by ``plot_mds``;
    * writes the Pearson and Kendall correlation tables to
      ``build/pearson.csv`` and ``build/kendall.csv``.
    """
    data = utils.read_data_from_csv('data/winequality-red.csv')

    # The first record supplies the attribute names.
    # NOTE(review): assumes at least one record was read; data[0] fails otherwise.
    for attribute in data[0].keys():
        # .items() iterates the same pairs on Python 2 and 3, whereas
        # .iteritems() only exists on Python 2 dictionaries.
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)

    plot_mds(data)

    data_frame.corr(method='pearson').to_csv('build/pearson.csv')
    data_frame.corr(method='kendall').to_csv('build/kendall.csv')
Esempio n. 2
0
def main():
    """Produce every plot and correlation export for the red-wine data.

    The records come from ``data/winequality-red.csv``.  Histograms are
    requested for each attribute / binning-function pair, then the scatter
    matrix, parallel-coordinates, PCA (raw and ``normalized=True``) and MDS
    plots, and finally the Pearson and Kendall correlation tables are saved
    as ``build/pearson.csv`` and ``build/kendall.csv``.
    """
    data = utils.read_data_from_csv('data/winequality-red.csv')

    # Attribute names are taken from the first record.
    # NOTE(review): an empty file would make data[0] fail -- confirm the
    # reader never returns an empty list.
    for attribute in data[0].keys():
        # dict.iteritems() was removed in Python 3; items() is valid on both
        # Python 2 and 3 and yields the same (name, function) pairs.
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)

    plot_mds(data)

    # One CSV per correlation method, named after the method.
    for method in ('pearson', 'kendall'):
        data_frame.corr(method=method).to_csv('build/%s.csv' % method)
import torch
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification, BertTokenizer
from tqdm import trange
import tarfile

import config
from utils import read_data_from_csv, prepare_data_bert

if __name__ == '__main__':
    # Script entry point: load the training records, then set up the BERT
    # tokenizer, the sequence classifier and the inputs for its optimizer.
    #
    # Original note: train_data is the same thing as the train_data and
    # test_data outputs from preprocess_data, just pickled. This helps avoid
    # having to run the preprocess_data script every time.
    # NOTE(review): the code below reads a CSV via read_data_from_csv rather
    # than a pickle -- the note above may be stale; confirm.
    print("Loading data...")
    if config.EQUALIZE_CLASS_COUNTS is True:
        print("Equalizing class counts!")
    train_data = read_data_from_csv(filename=config.CSV_FILENAME_TRAIN,
                                    train=True,
                                    num_records=config.BERT_NUM_RECORDS,
                                    equalize=config.EQUALIZE_CLASS_COUNTS)

    print("Loading models...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    # Initialize the model with 2 output classes.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                          num_labels=2)
    # Move the model to the device named in the configuration.
    device = torch.device(config.DEVICE)
    model = model.to(device)
    # Initialize the optimizer: collect the named parameters and the name
    # fragments listed under no_decay.
    # NOTE(review): presumably parameters matching no_decay are excluded from
    # weight decay; the grouping code is not visible here -- confirm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_params = [{
        'params':
    print "In-sample variance: %f" % numpy.var(in_sample_errors)
    print "In-sample mean: %f" % numpy.mean(in_sample_errors)

    out_sample_errors = calculate_linear_errors(model, model.predict(X_test), Y_test)
    plot_errors(out_sample_errors, 'Absolute error (out-sample)')
    utils.save_plot(pyplot, name = "build/%s_out_sample.png" % type)

    print "Out-of-sample variance: %0.3f" % numpy.var(out_sample_errors)
    print "Out-of-sample mean: %0.3f" % numpy.mean(out_sample_errors)

    return (numpy.mean(out_sample_errors) + numpy.mean(in_sample_errors)) / 2

if __name__ == '__main__':
    # Script entry point: load the red-wine data without three excluded
    # columns, split it into training and test parts, and run the linear
    # regression experiment through regression().
    dataset = utils.dict_to_numpy(
        utils.read_data_from_csv('data/winequality-red.csv'),
        columns_to_exclude=['fixed acidity', 'chlorides', 'free sulfur dioxide'])

    data = dataset['data']
    target = dataset['target']
    attributes = dataset['attributes']

    # The last `test_size` rows are held out for testing; all rows before
    # them are used for training.
    test_size = 100
    X_train, X_test = data[:-test_size], data[-test_size:]
    Y_train, Y_test = target[:-test_size], target[-test_size:]

    # print(...) with a single argument produces the same output on Python 2
    # and Python 3, whereas the bare print statement is a SyntaxError on 3.
    print('Linear regression')
    regression_model = linear_model.LinearRegression()
    regression(regression_model, X_train, X_test, Y_train, Y_test, 'linear')
    # Blank separator line after the report.
    print('')
Esempio n. 5
0
import config
from utils import read_data_from_csv, prepare_data_bert, print_evaluation_score


def flat_accuracy(preds, labels):
    """Return the fraction of predictions that equal their labels.

    Both arguments are converted to NumPy arrays and flattened before the
    element-wise comparison, so plain Python sequences are handled the same
    way as arrays.  (Comparing two lists directly with ``==`` yields a single
    bool rather than one result per element.)

    Args:
        preds: Predicted class labels (array-like).
        labels: Ground-truth class labels (array-like) with the same number
            of elements as ``preds``.

    Returns:
        The share of matching elements as a float between 0 and 1, or
        ``nan`` when ``labels`` is empty.
    """
    preds_flat = np.asarray(preds).ravel()
    labels_flat = np.asarray(labels).ravel()

    # Accuracy is undefined without samples; return nan explicitly instead of
    # dividing by zero.
    if labels_flat.size == 0:
        return float('nan')

    # The mean of the boolean match mask is matches / total, and is always a
    # float regardless of the interpreter's integer-division rules.
    return np.mean(preds_flat == labels_flat)


if __name__ == '__main__':
    # Script entry point: load the test records, the BERT classifier and its
    # tokenizer, build the test data loader and prepare the model for
    # evaluation.
    print("Loading data...")

    # Only reports the configuration flag; the read_data_from_csv call below
    # does not pass an `equalize` argument.
    if config.EQUALIZE_CLASS_COUNTS is True:
        print("Equalized class counts!")
    test_data = read_data_from_csv(filename=config.CSV_FILENAME_TEST,
                                   train=False)

    print("Loading models...")
    device = torch.device(config.DEVICE)
    # Classifier with 2 output labels, loaded from the configured archive,
    # and the tokenizer built from the configured vocabulary file.
    model = BertForSequenceClassification.from_pretrained(config.BERT_TAR_FILE,
                                                          num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(config.BERT_VOCAB_FILE,
                                              do_lower_case=True)

    print("Preparing testing data...")
    max_sent_len = config.BERT_MAX_SENT_LEN
    test_dataloader = prepare_data_bert(test_data, tokenizer, max_sent_len)

    print("Evaluating the model...")
    # Put the model in evaluation mode and move it to the configured device.
    model.eval()
    model.to(device)