Example #1
def single_word_file_predict(data_filename, predict_filename):
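    # Split each line into words and labels via data_utils.split, predict a label per word,
    # then merge labels into categories via data_utils.merge_label and write all variants to predict_filename.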
    print('Predict file ' + data_filename)
    sentence_list = []
    words_list = []
    labels_list = []
    with open(data_filename, mode='r') as data_file:
        for line in data_file:
            word_list, label_list = data_utils.split(line)
            if word_list and label_list:
                sentence_list.append(''.join(word_list))
                words_list.append(' '.join(word_list))
                labels_list.append(' '.join(label_list))
    predict_labels_list = words_predict(words_list)
    word_predict_label_list = []
    word_category_list = []
    word_predict_category_list = []
    for (words, labels, predict_labels) in zip(words_list, labels_list,
                                               predict_labels_list):
        word_list = words.split()
        label_list = labels.split()
        predict_label_list = predict_labels.split()
        word_predict_label = ' '.join([
            word + '/' + predict_label
            for (word, predict_label) in zip(word_list, predict_label_list)
        ])
        word_predict_label_list.append(word_predict_label)
        # merge label
        merge_word_list, merge_label_list = data_utils.merge_label(
            word_list, label_list)
        word_category = ' '.join([
            word + '/' + label
            for (word, label) in zip(merge_word_list, merge_label_list)
            if label != 'O'
        ])
        word_category_list.append(word_category)
        # merge predict label
        merge_predict_word_list, merge_predict_label_list = data_utils.merge_label(
            word_list, predict_label_list)
        word_predict_category = ' '.join([
            predict_word + '/' + predict_label
            for (predict_word, predict_label
                 ) in zip(merge_predict_word_list, merge_predict_label_list)
            if predict_label != 'O'
        ])
        word_predict_category_list.append(word_predict_category)
    with open(predict_filename, mode='w') as predict_file:
        for (sentence, word_predict_label, word_category, word_predict_category) in \
            zip(sentence_list, word_predict_label_list, word_category_list, word_predict_category_list):
            predict_file.write('Passage: ' + sentence + '\n')
            predict_file.write('SinglePredict: ' + word_predict_label + '\n')
            predict_file.write('Merge: ' + word_category + '\n')
            predict_file.write('MergePredict: ' + word_predict_category + '\n')
            predict_file.write('\n')
Example #2
def train_svm(dataset_loader, test_points, data_limit=0):
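    # Build a one-vs-all problem for class 0, hold out `test_points` examples via data_utils.split,
    # then train the SVM wrapper and evaluate it on the held-out data.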
    input_, output = get_data_up_to_limit(dataset_loader, data_limit)

    input_, output = data_utils.construct_one_vs_all(input_, output, 0)
    (input_train, input_test, output_train,
     output_test) = data_utils.split(input_, output, test_points)
    # Run the SVM
    svm = SVM()
    svm.give_training_data(input_train, output_train)
    svm.train()

    svm.give_test_data(input_test, output_test)
    svm.analyze()
Example #3
def file_predict(data_filename, predict_filename):
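    # Predict labels for each sentence in data_filename and write the gold and predicted label sequences to predict_filename.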
    print('Predict file ' + data_filename)
    words_list = []
    labels_list = []
    with open(data_filename, mode='r') as data_file:
        for line in data_file:
            word_list, label_list = data_utils.split(line)
            if word_list and label_list:
                words_list.append(' '.join(word_list))
                labels_list.append(' '.join(label_list))
    predict_labels_list = words_predict(words_list)
    with open(predict_filename, mode='w') as predict_file:
        for (words, labels, predict_labels) in zip(words_list, labels_list,
                                                   predict_labels_list):
            predict_file.write('Passage: ' + words + '\n')
            predict_file.write('Label: ' + labels + '\n')
            predict_file.write('PredictLabel: ' + predict_labels + '\n' + '\n')
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Split train.csv into train, dev, and test splits. '
        'Specify dev and validation set sizes with args; the remainder is used for training.')
    parser.add_argument(
        '--dataset-file',
        required=True,
        help='path to the train.csv file containing the quora training data')
    parser.add_argument('--ndev',
                        type=int,
                        default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid',
                        type=int,
                        default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed',
        type=int,
        help='optional random seed for reproducibility across multiple uses of this tool')
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)

    ntrain = len(data) - args.ndev - args.nvalid
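    # Everything not reserved for the dev and validation sets goes to training; split accordingly.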
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)

    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
Example #5
def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against training set and validation set in mmd
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy', allow_pickle=True).item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier + '.data.npy',
                            allow_pickle=True).item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            #train_targets = labels['train']
            #vali_targets = labels['vali']
            #test_targets = labels['test']
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(
                open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1, proportions=proportions)
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            settings = json.load(
                open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier + '.data.npy',
                           allow_pickle=True).item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch,
                                                       n_samples)
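    # Pool train, test, and model samples and use their median pairwise distance as a heuristic kernel bandwidth.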
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples,
        test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma,
        computeMMDs=False)
    # Alternative argument order:
    #    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, np.random.permutation(train)[:n_samples], test, sigma=heuristic_sigma, computeMMDs=False)
    # The function takes (X, Y, Z) as its first arguments; it tests whether MMDXY (i.e. the MMD between model and train)
    # is less than MMDXZ (the MMD between model and test).
    #    if pvalue < 0.05:
    #        print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that the test data has a smaller MMD with the true data than the generated data')
    #    else:
    #        print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclude that the test data has a smaller MMD with the true data than the generated data')
    return pvalue, tstat, sigma
Example #6
    ]
    data_settings = dict(
        (k, settings[k]) for k in data_vars if k in settings.keys())
    samples, pdf, labels = data_utils.get_data(settings['data'], data_settings)
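    # For multivariate MNIST data, reshape flattened samples back into square (sqrt(T) x sqrt(T)) images.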
    if 'multivariate_mnist' in settings and settings['multivariate_mnist']:
        seq_length = samples.shape[1]
        samples = samples.reshape(-1, int(np.sqrt(seq_length)),
                                  int(np.sqrt(seq_length)))
    if 'normalise' in settings and settings['normalise']:  # TODO this is a mess, fix
        print(settings['normalise'])
        norm = True
    else:
        norm = False
    if labels is None:
        train, vali, test = data_utils.split(samples, [0.6, 0.2, 0.2],
                                             normalise=norm)
        train_labels, vali_labels, test_labels = None, None, None
    else:
        train, vali, test, labels_list = data_utils.split(samples,
                                                          [0.6, 0.2, 0.2],
                                                          normalise=norm,
                                                          labels=labels)
        train_labels, vali_labels, test_labels = labels_list

labels = dict()
labels['train'], labels['vali'], labels['test'] = train_labels, vali_labels, test_labels
del train_labels
del vali_labels
del test_labels
Example #7
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(), normalize
])

transforms_test = transforms.Compose([transforms.ToTensor(), normalize])

best_acc = 0
start_epoch = 0
trainset = torchvision.datasets.CIFAR10(root='./data',
                                        train=True,
                                        download=True,
                                        transform=None)

train_data, train_labels = trainset.data, np.squeeze(trainset.targets)

unlabeled_idxs, labeled_idxs, _ = du.split(trainset, sn=5000, v_sn=0)
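# du.split appears to return index sets; sn=5000 presumably selects 5,000 labeled samples and v_sn=0 reserves none for validation.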

train_datasets = DT(trainData=train_data[labeled_idxs, :, :, :],
                    trainLabel=train_labels[labeled_idxs],
                    transform=transforms_train)

trainloader = torch.utils.data.DataLoader(train_datasets,
                                          batch_size=100,
                                          shuffle=True,
                                          num_workers=4)

testset = torchvision.datasets.CIFAR10(root='./data',
                                       train=False,
                                       download=True,
                                       transform=transforms_test)
testloader = torch.utils.data.DataLoader(testset,
                                         batch_size=100,  # batch size, shuffle, and workers assumed; snippet is truncated here
                                         shuffle=False,
                                         num_workers=4)
Example #8
samples = []
# Load the stock data row by row.
# Shape is num_samples x sequence_length (each stock's time series) x number of
# signal channels (high, open, close, low, volume at each time step).

# Everything needs to be an np.ndarray.
for stock, df_stock in df.groupby('Name'):
    sequence = []
    for index, row in df_stock.iterrows():
        #print(list(row[1:6]))
        sequence.append(np.array(list(row[1:6])))
    sequence = np.array(sequence)
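    # Keep only sequences with the full expected length (1259 time steps).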
    if len(sequence) == 1259:
        samples.append(sequence)
samples = np.array(samples)
# Split into train/vali/test. The built-in normalise/scale option doesn't work correctly here, so scale manually below.
train, vali, test = data_utils.split(samples, [0.6, 0.2, 0.2], normalise=False)
train_labels, vali_labels, test_labels = None, None, None
# Check that values are scaled correctly
print("Before scaling")
print(train)
for i in range(len(train)):
    train[i] = scale_linear_bycolumn(train[i])
for i in range(len(vali)):
    vali[i] = scale_linear_bycolumn(vali[i])
for i in range(len(test)):
    test[i] = scale_linear_bycolumn(test[i])
print("After scaling")
print(train)
labels = dict()
labels['train'], labels['vali'], labels['test'] = train_labels, vali_labels, test_labels