Example #1
0
    # Optional command-line arguments: sys.argv[5] and sys.argv[6] select how
    # the orthographic features are used; when absent, the names 'NONE' and
    # 'ONE_HOT' are used instead.
    orthographic_insertion_place = 'NONE' if len(
        sys.argv) <= 5 else sys.argv[5]
    orthographic_insertion_type = 'ONE_HOT' if len(
        sys.argv) <= 6 else sys.argv[6]

    # Replace the raw strings with the members looked up by upper-cased name.
    # NOTE(review): both look like Enum classes, in which case an unknown name
    # fails at this lookup -- confirm against their definitions.
    orthographic_insertion_place = OrthographicConcatePlace[
        orthographic_insertion_place.upper()]
    orthographic_insertion_type = OrthographicFeatureForm[
        orthographic_insertion_type.upper()]

    p = PreprocessData(dataset_type='wsj')

    files = p.preProcessDirectory(dataset_path)

    # Either use the split provided by PreprocessData, or shuffle the file
    # list and split it twice (first with 0.8, then the remainder with 0.5).
    # NOTE(review): presumably this yields an 80/10/10 train/test/validation
    # split -- confirm the meaning of split_data's second argument.
    if split_type == 'standard':
        train_files, val_files, test_files = p.get_standard_split(files)
    else:
        shuffle(files)
        train_files, test_val_files = p.split_data(files, 0.8)
        test_files, val_files = p.split_data(test_val_files, 0.5)

    # Load each partition, tagging it with the partition name.
    train_mat = p.get_raw_data(train_files, 'train')
    val_mat = p.get_raw_data(val_files, 'validation')
    test_mat = p.get_raw_data(test_files, 'test')

    # get_processed_data returns eight values per partition; the last one is
    # discarded here.
    X_train, y_train, P_train, S_train, XC_train, XN_train, XH_train, _ = p.get_processed_data(
        train_mat, MAX_LENGTH)
    X_val, y_val, P_val, S_val, XC_val, XN_val, XH_val, _ = p.get_processed_data(
        val_mat, MAX_LENGTH)
    X_test, y_test, P_test, S_test, XC_test, XN_test, XH_test, _ = p.get_processed_data(
        test_mat, MAX_LENGTH)
Example #2
0
    # Make sure the working directory exists before the split files are
    # written into it.
    # NOTE(review): the 'train' branch that follows removes and recreates
    # train_dir; if the split files are saved there (as the comment further
    # down states) they are deleted at that point -- confirm this is intended.
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    # specify train or test
    experiment_type = sys.argv[4]

    # Optional override of MAX_LENGTH from sys.argv[6]. That index only exists
    # when len(sys.argv) >= 7; the earlier `>= 6` guard raised IndexError for
    # an invocation that stopped at sys.argv[5].
    if len(sys.argv) > 6:
        MAX_LENGTH = int(sys.argv[6])
    # Parenthesised single-argument form: prints the same text under Python 2
    # and is also valid Python 3.
    print("Using max length: {0}".format(MAX_LENGTH))

    # initialize a new PreprocessData instance
    p = PreprocessData(sys.argv[5])
    # split them into training, validation, and test files
    # these will be saved in train_dir/train.txt, train_dir/val.txt, train_dir/test.txt
    train_file, val_file, test_file = p.get_standard_split(
        clickbait_raw_path, normal_raw_path, train_dir)

    # Load each split file, tagging it with the partition name.
    train_mat = p.process_single_file(train_file, 'train')
    val_mat = p.process_single_file(val_file, 'validation')
    test_mat = p.process_single_file(test_file, 'test')

    # Convert each partition into an (inputs, labels) pair using MAX_LENGTH.
    X_train, Y_train = p.get_processed_data(train_mat, MAX_LENGTH)
    X_val, Y_val = p.get_processed_data(val_mat, MAX_LENGTH)
    X_test, Y_test = p.get_processed_data(test_mat, MAX_LENGTH)

    if experiment_type == 'train':
        if os.path.exists(train_dir):
            shutil.rmtree(train_dir)
        os.mkdir(train_dir)
        train(X_train, Y_train, X_val, Y_val, train_dir)
    else: