Example #1
import os
import sys

import yaml

# PreprocessData, Model and Visualization are assumed to come from the
# project's own modules, which are not shown in this snippet.

def main():
    if len(sys.argv) < 2:
        print("Behavioral Cloning\n", "Usage: python3 main.py config.json")
    else:
        config_file = sys.argv[1]
        with open(config_file) as yaml_file:
            # The FullLoader parameter handles the conversion from YAML
            # scalar values to the native Python dictionary format
            configs = yaml.load(yaml_file, Loader=yaml.FullLoader)

            # Data configurations
            data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), configs["data_path"])
            labels_file = configs["labels_file"]
            skip_header = bool(configs["skipHeader"])
            use_side_images = bool(configs["useSideImages"])
            correction_factor = float(configs["correctionFactor"])
            train_valid_split = float(configs["trainValidSplit"])
            do_flip = bool(configs["doFlip"])

            # Training configurations
            top_crop = int(configs["topCrop"])
            bottom_crop = int(configs["bottomCrop"])
            batch_size = int(configs["batchSize"])
            epochs = int(configs["epochs"])
            loss = configs["loss"]
            optimizer = configs["optimizer"]
            verbose = int(configs["verbose"])
            model_name = configs["modelName"]
            output_dir = configs["outputDir"]

            # Init Preprocessing
            preprocess = PreprocessData(data_path, labels_file, correction_factor, skip_header, use_side_images,
                                        train_valid_split, do_flip)

            # Preprocess data and extract training and validation samples
            train_samples, validation_samples = preprocess.splitData()

            # Initialize train and validation generators
            train_generator = preprocess.generator(train_samples, batch_size=batch_size)
            validation_generator = preprocess.generator(validation_samples, batch_size=batch_size)

            # Get image shape
            img_shape = preprocess.get_image_shape()

            # Initialize training network
            network = Model(img_shape, top_crop, bottom_crop, batch_size, epochs, loss, optimizer, verbose,
                            train_generator, validation_generator, train_samples, validation_samples, model_name)

            model = network.create_model()

            # Initialize visualization
            visualize = Visualization(model, output_dir)
            visualize.visualize_model()

            network.get_summary(model)

            results = network.train_model(model)

            visualize.save_plots(results)
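
For reference, main() above expects its configuration file to provide the keys it reads. A minimal sketch of how such a file could be generated with PyYAML follows; every path, file name and hyperparameter value here is an illustrative placeholder, not the original project's setting. (Since JSON is, for practical purposes, a subset of YAML, the config.json name in the usage string and the YAML loader in the code are compatible.)

import yaml

# Illustrative only: a config dict with exactly the keys main() reads above.
example_config = {
    "data_path": "data",
    "labels_file": "labels.csv",
    "skipHeader": True,
    "useSideImages": True,
    "correctionFactor": 0.2,
    "trainValidSplit": 0.2,
    "doFlip": True,
    "topCrop": 70,
    "bottomCrop": 25,
    "batchSize": 32,
    "epochs": 5,
    "loss": "mse",
    "optimizer": "adam",
    "verbose": 1,
    "modelName": "model.h5",
    "outputDir": "output",
}

with open("config.yaml", "w") as f:
    yaml.safe_dump(example_config, f)
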
Example #2
import pickle

import numpy as np

# PreprocessData, balance_model, df_splitter, get_x_y and standardize are
# assumed to be helpers imported from the project's own modules.

def prepare_data(data_size, max_len_1, max_len_2, preprocess, balance_data):
    if preprocess:
        pd = PreprocessData(data_size, max_len_1, max_len_2)
        pd.storeData()

    with open("data/data_emb200_pad_%d.p" % data_size, "rb") as pickle_file:
        data_padded = pickle.load(pickle_file)
    print(data_padded.columns)

    if balance_data:
        data = balance_model(data_padded)
    else:
        data = data_padded

    # Split into train, dev and test pandas DataFrames
    # print "Splitting embedded data..."
    train, dev, test = df_splitter(data)

    # Obtain train, dev, test data
    train_X1, train_X2, train_label = get_x_y(train)
    dev_X1, dev_X2, dev_label = get_x_y(dev)
    test_X1, test_X2, test_label = get_x_y(test)

    # Normalize x1, x2
    train_X1, dev_X1, test_X1 = standardize(train_X1, dev_X1, test_X1)
    train_X2, dev_X2, test_X2 = standardize(train_X2, dev_X2, test_X2)

    print('train non-duplicate prop:', 1 - np.mean(train_label))
    print('dev non-duplicate prop:', 1 - np.mean(dev_label))
    print('test non-duplicate prop:', 1 - np.mean(test_label))

    # Transform y to one-hot vectors
    n_labels = 2
    train_y = np.zeros((train_label.shape[0], n_labels))
    dev_y = np.zeros((dev_label.shape[0], n_labels))
    test_y = np.zeros((test_label.shape[0], n_labels))
    train_y[np.arange(train_label.shape[0]), train_label] = 1
    dev_y[np.arange(dev_label.shape[0]), dev_label] = 1
    test_y[np.arange(test_label.shape[0]), test_label] = 1

    return data_padded, train, train_X1, train_X2, train_y, dev_X1, dev_X2, dev_y, test_X1, test_X2, test_y
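
The last block above turns integer labels into one-hot rows by fancy indexing. A toy illustration of the same trick on a small label vector (values made up for the demonstration):

import numpy as np

labels = np.array([0, 1, 1, 0])   # e.g. duplicate / non-duplicate flags
n_labels = 2

# Index each row with its label to set the corresponding column to 1.
one_hot = np.zeros((labels.shape[0], n_labels))
one_hot[np.arange(labels.shape[0]), labels] = 1

print(one_hot)
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [1. 0.]]
# Equivalent shorthand: np.eye(n_labels)[labels]
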
Example #3
    dataset_path = sys.argv[1]
    train_dir = sys.argv[2]
    split_type = sys.argv[3]
    experiment_type = sys.argv[4]

    orthographic_insertion_place = 'NONE' if len(sys.argv) <= 5 else sys.argv[5]
    orthographic_insertion_type = 'ONE_HOT' if len(sys.argv) <= 6 else sys.argv[6]

    orthographic_insertion_place = OrthographicConcatePlace[
        orthographic_insertion_place.upper()]
    orthographic_insertion_type = OrthographicFeatureForm[
        orthographic_insertion_type.upper()]

    p = PreprocessData(dataset_type='wsj')

    files = p.preProcessDirectory(dataset_path)

    if split_type == 'standard':
        train_files, val_files, test_files = p.get_standard_split(files)
    else:
        shuffle(files)
        train_files, test_val_files = p.split_data(files, 0.8)
        test_files, val_files = p.split_data(test_val_files, 0.5)

    train_mat = p.get_raw_data(train_files, 'train')
    val_mat = p.get_raw_data(val_files, 'validation')
    test_mat = p.get_raw_data(test_files, 'test')

    X_train, y_train, P_train, S_train, XC_train, XN_train, XH_train, _ = p.get_processed_data(
Example #4
    # specify directory where model is saved or loaded
    train_dir = sys.argv[3]

    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    # specify train or test
    experiment_type = sys.argv[4]

    if len(sys.argv) > 6:
        MAX_LENGTH = int(sys.argv[6])
    print("Using max length: {0}".format(MAX_LENGTH))

    # initialize a new PreprocessData instance
    p = PreprocessData(sys.argv[5])
    # split them into training, validation, and test files
    # these will be saved in train_dir/train.txt, train_dir/val.txt, train_dir/test.txt
    train_file, val_file, test_file = p.get_standard_split(
        clickbait_raw_path, normal_raw_path, train_dir)

    train_mat = p.process_single_file(train_file, 'train')
    val_mat = p.process_single_file(val_file, 'validation')
    test_mat = p.process_single_file(test_file, 'test')

    X_train, Y_train = p.get_processed_data(train_mat, MAX_LENGTH)
    X_val, Y_val = p.get_processed_data(val_mat, MAX_LENGTH)
    X_test, Y_test = p.get_processed_data(test_mat, MAX_LENGTH)

    if experiment_type == 'train':
        if os.path.exists(train_dir):
Example #5
				saver.restore(sess, ckpt.model_checkpoint_path)

				global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
			test_loss, test_accuracy, test_oov_accuracy = compute_summary_metrics(sess, m, sentence_words_test,
															   sentence_tags_test)
			print('Test Accuracy: {:.3f}'.format(test_accuracy))
			print('Test Loss: {:.3f}'.format(test_loss))
			print('Test OOV Accuracy: {:.3f}'.format(test_oov_accuracy))

if __name__ == '__main__':
	dataset_path = sys.argv[1]
	train_dir = sys.argv[2]
	split_type = sys.argv[3]
	experiment_type = sys.argv[4]

	p = PreprocessData(dataset_type='wsj', model_type_=int(sys.argv[5]))

	files = p.preProcessDirectory(dataset_path)
	
	if split_type == 'standard':
		train_files, val_files, test_files = p.get_standard_split(files)
	else:
		shuffle(files)
		train_files, test_val_files = p.split_data(files, 0.8)
		test_files, val_files = p.split_data(test_val_files, 0.5)

	train_mat = p.get_raw_data(train_files, 'train')
	val_mat = p.get_raw_data(val_files, 'validation')
	test_mat = p.get_raw_data(test_files, 'test')

	X_train, y_train, _ = p.get_processed_data(train_mat, MAX_LENGTH)
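
Examples 3 and 5 build their non-standard split by calling split_data twice. The real PreprocessData method is not shown here, but a minimal stand-in inferred from the call sites illustrates why the two calls (0.8, then 0.5 on the remainder) give an 80/10/10 train/test/validation split:

from random import shuffle

# Hypothetical stand-in for p.split_data, inferred from how it is called above:
# cut the list at the given ratio and return both parts.
def split_data(items, ratio):
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]

files = list(range(100))
shuffle(files)
train_files, test_val_files = split_data(files, 0.8)      # 80 files
test_files, val_files = split_data(test_val_files, 0.5)   # 10 and 10 files
print(len(train_files), len(test_files), len(val_files))  # 80 10 10
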
Example #6
args = parser.parse_args()

if args.load_data:
    _ = LoadData(data_path=args.data,
                 checkpoint_path=args.data_checkpoint,
                 train_file=args.dataset_name + '_' + 'train.csv',
                 dev_file=args.dataset_name + '_' + 'dev.csv')

# Create checkpoint folders for storage between sessions
create_checkpoint_paths(args.data_checkpoint)
create_checkpoint_paths(args.model_checkpoint)

data_preprocessor = PreprocessData(data_path=args.data_checkpoint,
                                   glove_size=args.glove_size,
                                   batch_size=args.batch_size,
                                   train_file='train1.csv',
                                   dev_file='dev1.csv')
                                   # train_file=args.dataset_name + '_' + 'train.csv',
                                   # dev_file=args.dataset_name + '_' + 'dev.csv')

save_vocab(data_preprocessor.WORDS.vocab, args.model_checkpoint + '/' + args.dataset_name + '_' + 'WORDS.vocab')
save_vocab(data_preprocessor.CHAR.vocab, args.model_checkpoint + '/' + args.dataset_name + '_' + 'CHAR.vocab')

# Initialize the BiDAF model
model = BidaF(data_preprocessor.WORDS,
              data_preprocessor.CHAR,
              char_embedding_size=args.char_embedding_size,
              char_conv_kernel_size=args.kernel_size,
              char_conv_channels_count=args.channels_count).to(device)
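
This snippet starts after its argument parser has already been built and a torch device chosen. A sketch of an argparse setup consistent with the attributes used above follows; the flag names come from those attributes, while the defaults and help texts are assumptions:

import argparse

parser = argparse.ArgumentParser(description="BiDAF training setup (illustrative)")
parser.add_argument("--data", help="path to the raw dataset")
parser.add_argument("--data_checkpoint", help="folder for preprocessed data")
parser.add_argument("--model_checkpoint", help="folder for model checkpoints")
parser.add_argument("--dataset_name", default="dataset")
parser.add_argument("--glove_size", type=int, default=100)
parser.add_argument("--batch_size", type=int, default=60)
parser.add_argument("--char_embedding_size", type=int, default=8)
parser.add_argument("--kernel_size", type=int, default=5)
parser.add_argument("--channels_count", type=int, default=100)
parser.add_argument("--load_data", action="store_true",
                    help="run LoadData before preprocessing")
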
Example #7
from preprocess import PreprocessData

if __name__ == '__main__':
    PreprocessData.runTxt()
    PreprocessData.runImgs()