def generator(samples, batch_size=32):
    num_samples = len(samples)
    while True:  # Loop forever so the generator never terminates
        samples = sklearn.utils.shuffle(samples)  # shuffle returns a new list; reassign it
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            images = []
            angles = []
            for batch_sample in batch_samples:
                center_name = FLAGS.data_dir + '/IMG/' + batch_sample[0].split('/')[-1]
                center_image = preprocess_image(cv2.imread(center_name))
                center_angle = float(batch_sample[3])
                images.append(center_image)
                angles.append(center_angle)
                # data augmentation
                augmented_c_image, augmented_c_angle = augment_data(
                    center_image, center_angle)
                images.append(augmented_c_image)
                angles.append(augmented_c_angle)

                # add in left and right cameras' info
                left_name = FLAGS.data_dir + '/IMG/' + batch_sample[1].split('/')[-1]
                left_image = preprocess_image(cv2.imread(left_name))
                right_name = FLAGS.data_dir + '/IMG/' + batch_sample[2].split('/')[-1]
                right_image = preprocess_image(cv2.imread(right_name))
                # create adjusted steering measurements for the side camera images
                correction = 0.3  # this is a parameter to tune
                left_angle = center_angle + correction
                right_angle = center_angle - correction
                # add images and angles to data set
                images.extend([left_image, right_image])
                angles.extend([left_angle, right_angle])

                # data augmentation
                augmented_l_image, augmented_l_angle = augment_data(
                    left_image, left_angle)
                augmented_r_image, augmented_r_angle = augment_data(
                    right_image, right_angle)
                images.extend([augmented_l_image, augmented_r_image])
                angles.extend([augmented_l_angle, augmented_r_angle])

            # stack the batch into numpy arrays
            X = np.array(images)
            y = np.array(angles)

            X, y = sklearn.utils.shuffle(X, y)

            yield X, y
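# A minimal usage sketch (not part of the original source): assuming `samples` holds the
# parsed rows of driving_log.csv and `model` is an already-compiled Keras model, the
# generator above could feed training roughly like this. Note that each sample row yields
# six images (center/left/right plus their augmented copies), so the step counts below
# are approximate.
from sklearn.model_selection import train_test_split

train_samples, validation_samples = train_test_split(samples, test_size=0.2)
train_generator = generator(train_samples, batch_size=32)
validation_generator = generator(validation_samples, batch_size=32)
model.fit_generator(train_generator,
                    steps_per_epoch=len(train_samples) // 32,
                    validation_data=validation_generator,
                    validation_steps=len(validation_samples) // 32,
                    epochs=5)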
Example #2
def read_data(datadir):
    data = {}
    for filename in ['train', 'valid', 'test']:
        with open(os.path.join(datadir, filename + '.p'), 'rb') as f:
            data[filename] = pickle.load(f)

    X_train = data['train']['features']
    y_train = data['train']['labels']

    from collections import Counter
    count = Counter(y_train)

    target_num_image = count.most_common(1)[0][1] * 0.5
    new_X_train = np.empty(shape=(0, *X_train.shape[1:]), dtype=X_train.dtype)
    new_y_train = np.empty(shape=(0, ), dtype=y_train.dtype)
    for k, v in count.items():
        X_train_ = X_train[y_train == k]
        y_train_ = y_train[y_train == k]
        if v < target_num_image:
            print('Augmenting label=%d, count=%d, target_num_image=%d ...' %
                  (k, v, target_num_image))
            X_train_, y_train_ = utils.augment_data(X_train_, y_train_,
                                                    target_num_image)
        new_X_train = np.append(new_X_train, X_train_, axis=0)
        new_y_train = np.append(new_y_train, y_train_, axis=0)

    data['train']['features'] = np.array(new_X_train)
    data['train']['labels'] = np.array(new_y_train)

    return data
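# The class balancing above relies on utils.augment_data, which is not shown here. A
# minimal sketch of such a helper, assuming it simply oversamples a class (with a small
# random brightness jitter) until roughly target_num_image examples exist. This is an
# illustrative assumption, not the original implementation.
import numpy as np

def augment_data_sketch(X_class, y_class, target_num_image):
    """Oversample one class up to target_num_image by jittering random copies."""
    n_missing = int(target_num_image) - len(X_class)
    if n_missing <= 0:
        return X_class, y_class
    idx = np.random.randint(0, len(X_class), size=n_missing)
    jitter = np.random.randint(-10, 11, size=X_class[idx].shape)
    extra = np.clip(X_class[idx].astype(np.int16) + jitter, 0, 255).astype(X_class.dtype)
    return (np.concatenate([X_class, extra], axis=0),
            np.concatenate([y_class, y_class[idx]], axis=0))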
def data_pipeline(param):
    # load data
    X_train, y_train = utils.load_data('./data/train.p')
    X_valid, y_valid = utils.load_data('./data/valid.p')
    X_test, y_test = utils.load_data('./data/test.p')

    n_train = len(X_train)
    n_test = len(X_test)
    print("Number of training examples =", n_train)
    print("Number of testing examples =", n_test)

    image_shape = X_train.shape[1:]
    print("Image data shape =", image_shape)

    n_classes = np.max(y_train) + 1
    print("Number of classes =", n_classes)

    # data augmentation
    X_train, y_train = utils.augment_data(X_train, y_train, param)
    print("Number of augmented training examples =", len(X_train))
    print("Number of validation examples =", len(X_valid))

    # pre-process
    X_train = np.array(
        [utils.pre_process(X_train[i]) for i in range(len(X_train))],
        dtype=np.float32)
    X_valid = np.array(
        [utils.pre_process(X_valid[i]) for i in range(len(X_valid))],
        dtype=np.float32)
    X_test = np.array(
        [utils.pre_process(X_test[i]) for i in range(len(X_test))],
        dtype=np.float32)

    return X_train, y_train, X_valid, y_valid, X_test, y_test
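# utils.pre_process is not shown in this listing; a minimal sketch, assuming the common
# traffic-sign preprocessing of grayscale conversion plus rough normalization (an
# assumption, not the original implementation):
import numpy as np

def pre_process_sketch(image):
    """Convert an RGB image to normalized grayscale in roughly [-1, 1]."""
    gray = np.mean(image, axis=-1, keepdims=True)
    return (gray - 128.0) / 128.0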
def gen_new_train(param):
    # load data
    X_train, y_train = utils.load_data('./data/train.p')

    # data augmentation
    X_train, y_train = utils.augment_data(X_train, y_train, param)

    # pre-process
    X_train = np.array(
        [utils.pre_process(X_train[i]) for i in range(len(X_train))],
        dtype=np.float32)

    # one hot
    oh_y_train = utils.one_hot_encode(y_train)

    return X_train, y_train, oh_y_train
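# utils.one_hot_encode is likewise not shown; a minimal sketch, assuming dense integer
# labels starting at 0 (an assumption about the original helper):
import numpy as np

def one_hot_encode_sketch(labels, n_classes=None):
    """Turn an integer label vector into a one-hot matrix."""
    n_classes = n_classes or int(np.max(labels)) + 1
    one_hot = np.zeros((len(labels), n_classes), dtype=np.float32)
    one_hot[np.arange(len(labels)), labels] = 1.0
    return one_hot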
def iterate_minibatches(images, labels, batch_size):
    '''
    Create mini-batches of a given batch size from the dataset.
    :param images: numpy array of images/volumes
    :param labels: numpy array of labels (same leading dimension as images)
    :param batch_size: number of samples per mini-batch
    :return: generator yielding (X, y) mini-batches
    '''

    # ===========================
    # generate indices to randomly select slices in each minibatch
    # ===========================
    n_images = images.shape[0]
    random_indices = np.arange(n_images)
    np.random.shuffle(random_indices)

    # ===========================
    # iterate over full batches only; the final incomplete batch is skipped
    # ===========================
    for b_i in range(0, n_images, batch_size):

        if b_i + batch_size > n_images:
            continue

        # HDF5 requires indices to be in increasing order
        batch_indices = np.sort(random_indices[b_i:b_i + batch_size])

        X = images[batch_indices, ...]
        y = labels[batch_indices, ...]

        # ===========================
        # check if the velocity fields are to be used for the segmentation...
        # ===========================
        if exp_config.nchannels == 1:
            X = X[..., 0:1]

        # ===========================
        # augment the batch
        # ===========================
        if exp_config.da_ratio > 0.0 and exp_config.nchannels == 1:
            X, y = utils.augment_data(X, y, data_aug_ratio=exp_config.da_ratio)

        yield X, y
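# A usage sketch (assumed, not from the original): one pass over the mini-batch generator
# per epoch, where `images`, `labels`, and `train_step` stand in for the caller's data
# and per-batch update function.
n_epochs = 10  # illustrative value
for epoch in range(n_epochs):
    for X_batch, y_batch in iterate_minibatches(images, labels, batch_size=8):
        train_step(X_batch, y_batch)  # e.g. one optimizer / session.run update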
Example #6
def read_data(data_path):
    samples = []
    log_path = os.path.join(os.path.abspath(data_path), 'driving_log.csv')
    with open(log_path) as csvfile:
        reader = csv.reader(csvfile)
        for line in reader:
            samples.append(line)

    images = []
    angles = []
    for line in samples:
        filename = FLAGS.data_dir + '/IMG/' + line[0].split('/')[-1]
        image = cv2.imread(filename)
        angle = float(line[3])
        image = preprocess_image(image)
        images.append(image)
        angles.append(angle)
        # data augmentation
        augmented_image, augmented_angle = augment_data(image, angle)
        images.append(augmented_image)
        angles.append(augmented_angle)
    return np.array(images), np.array(angles)
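# preprocess_image and augment_data are defined elsewhere in this project. A common
# behavioral-cloning style sketch of what they might do (an assumption, not the original
# implementation): convert BGR to RGB, and mirror the image while negating the angle.
import cv2

def preprocess_image_sketch(image):
    """Convert the BGR image returned by cv2.imread to RGB."""
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def augment_data_sketch(image, angle):
    """Mirror the image left-right and negate the steering angle."""
    return cv2.flip(image, 1), -angle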
X_train, X_val, X_angle_train, X_angle_val, y_train, y_val = train_test_split( X, X_a, y, train_size = .8, random_state = SEED )
#X_train, X_val, y_train, y_val = train_test_split( X, y, train_size = .8, random_state = SEED )
callback_list = get_callbacks( WEIGHT_SAVE_PATH, 20 )

model = inception_stem()
start_time = time.time()

if USE_AUGMENTATION:
    image_augmentation = ImageDataGenerator( rotation_range = 20,
                                             horizontal_flip = True,
                                             vertical_flip = True,
                                             width_shift_range = .3,
                                             height_shift_range =.3,
                                             zoom_range = .1 )

    input_generator = augment_data( image_augmentation, X_train, X_angle_train, y_train, batch_size = BATCH_SIZE )

    model.fit_generator( input_generator, steps_per_epoch = 4096 // BATCH_SIZE, epochs = EPOCHS,
                        callbacks = callback_list, verbose = 2,
                        validation_data = augment_data(image_augmentation, X_val, X_angle_val, y_val, batch_size = BATCH_SIZE),
                        validation_steps = len(X_val) // BATCH_SIZE )

else: 
    # Just fit the model to the given training data (image and angle inputs)
    model.fit( [X_train, X_angle_train], y_train, batch_size = BATCH_SIZE, epochs = EPOCHS, verbose = 1,
               validation_data = ([X_val, X_angle_val], y_val), callbacks = callback_list )

m, s = divmod( time.time() - start_time, 60 )
print( 'Model fitting done. Total time: {}m {}s'.format(int(m), int(s)) )

model.load_weights( WEIGHT_SAVE_PATH )
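# augment_data above is a project-specific generator, not part of the Keras API. A sketch
# of how it might pair the augmented image batch with the untouched auxiliary angle input
# (an assumption about the original implementation): two flows share a seed so their
# shuffling order stays aligned.
def augment_data_sketch(image_gen, X, X_angle, y, batch_size=32):
    """Yield ([augmented_images, angles], labels) batches for a two-input model."""
    flow_images = image_gen.flow(X, y, batch_size=batch_size, seed=SEED, shuffle=True)
    flow_angles = image_gen.flow(X, X_angle, batch_size=batch_size, seed=SEED, shuffle=True)
    while True:
        X_batch, y_batch = next(flow_images)
        _, angle_batch = next(flow_angles)
        yield [X_batch, angle_batch], y_batch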
Example #8
def run():

    logger.debug("Reading in the crispr dataset %s" % config.input_dataset)
    crispr = pd.read_csv(config.input_dataset)
    crispr['PAM'] = crispr['sequence'].str[-3:]
    if config.log_cen:
        crispr['essentiality'] = np.log(crispr['essentiality'] * 100 + 1)
    if config.with_pam:
        pam_code = 8
    else:
        pam_code = 0
    # scale_features
    process_features.scale_features(crispr)
    process_features.scale_output(crispr)
    logger.debug("Read in data successfully")

    logger.debug("Transforming data")
    X_for = crispr.loc[:, 'sequence'].apply(
        lambda seq: utils.split_seqs(seq[:config.seq_len]))
    X_rev = crispr.loc[:, 'sequence'].apply(
        lambda seq: utils.split_seqs(seq[config.seq_len - 1::-1]))
    X_cnn = crispr.loc[:, 'sequence'].apply(
        lambda seq: utils.split_seqs(seq[:config.seq_len], nt=1))
    X = pd.concat([X_for, X_rev, X_cnn], axis=1)
    logger.debug("Get sequence sucessfully")
    off_target_X = pd.DataFrame(np.empty(shape=[X_for.shape[0], 0]))
    # off_target_X = crispr.loc[:, 'sequence'].apply(lambda seq: utils.map_to_matrix(seq, 1, 22))
    # y = pd.DataFrame(np.abs(crispr[config.y].copy()) * 10)
    y = pd.DataFrame(crispr[config.y].copy() * 8)
    logger.debug("Transformed data successfully")

    logger.debug(
        "Preparing to split the dataset into training and test sets based on genes"
    )
    logger.debug("Generating groups based on gene names")
    if config.group:
        crispr.loc[:, "group"] = pd.Categorical(crispr.loc[:, config.group])
    logger.debug("Generated groups information successfully")

    logger.debug("Splitting dataset")
    if os.path.exists(config.train_index) and os.path.exists(
            config.test_index):
        train_index = pickle.load(open(config.train_index, "rb"))
        test_index = pickle.load(open(config.test_index, "rb"))
    else:
        train_test_split = getattr(process_features,
                                   config.split_method + "_split",
                                   process_features.regular_split)
        train_index, test_index = train_test_split(crispr,
                                                   group_col=config.group_col,
                                                   n_split=max(
                                                       len(crispr) / 100, 10),
                                                   rd_state=7)

        with open(config.train_index, 'wb') as train_file:
            pickle.dump(train_index, train_file)
        with open(config.test_index, 'wb') as test_file:
            pickle.dump(test_index, test_file)

    if config.test_cellline:
        test_cellline_index = crispr[crispr['cellline'] ==
                                     config.test_cellline].index
        test_index = test_cellline_index.intersection(test_index)

    test_index_list = [
        x.index
        for _, x in crispr.loc[test_index, :].reset_index().groupby('group')
        if len(x)
    ] if config.test_method == 'group' else []
    logger.debug("Splitted data successfully")

    logger.debug("training data amounts: %s, testing data amounts: %s" %
                 (len(train_index), len(test_index)))
    x_train, x_test, y_train, y_test, off_target_X_train, off_target_X_test = \
                                       X.loc[train_index, :], X.loc[test_index, :], \
                                       y.loc[train_index, :], y.loc[test_index, :], \
                                       off_target_X.loc[train_index, :], off_target_X.loc[test_index, :]

    _, unique_train_index = np.unique(pd.concat([x_train, y_train], axis=1),
                                      return_index=True,
                                      axis=0)
    _, unique_test_index = np.unique(pd.concat([x_test, y_test], axis=1),
                                     return_index=True,
                                     axis=0)
    logger.debug(
        "after deduplication, training set size: %s, test set size: %s"
        % (len(unique_train_index), len(unique_test_index)))
    logger.debug("Splitted dataset successfully")

    logger.debug("Generating one hot vector for categorical data")

    extra_crispr_df = crispr[config.extra_categorical_features +
                             config.extra_numerical_features]

    if config.with_pam:
        n_values = [pam_code] + [2] * (len(config.extra_categorical_features) - 1)
    else:
        n_values = [2] * len(config.extra_categorical_features)
    process_features.process_categorical_features(extra_crispr_df, n_values)
    extra_x_train, extra_x_test = extra_crispr_df.loc[
        train_index, :].values, extra_crispr_df.loc[test_index, :].values
    logger.debug("Generating on hot vector for categorical data successfully")

    logger.debug("Seperate forward and reverse seq")
    x_train = x_train.values
    for_input_len = config.seq_len - config.word_len + 1
    for_input = x_train[:, :for_input_len]
    rev_input = x_train[:, for_input_len:2 * for_input_len]
    for_cnn = x_train[:, 2 * for_input_len:]
    x_test = x_test.values
    for_x_test = x_test[:, :for_input_len]
    rev_x_test = x_test[:, for_input_len:2 * for_input_len]
    for_cnn_test = x_test[:, 2 * for_input_len:]
    off_target_X_train = off_target_X_train.values
    off_target_X_test = off_target_X_test.values
    if not config.off_target:
        off_target_X_train, off_target_X_test = np.empty(
            shape=[off_target_X_train.shape[0], 0]), np.empty(
                shape=[off_target_X_test.shape[0], 0])

    if (not config.rev_seq) or (config.model_type == 'mixed'):
        rev_input, rev_x_test = np.empty(
            shape=[rev_input.shape[0], 0]), np.empty(
                shape=[rev_x_test.shape[0], 0])

    y_train = y_train.values
    filter = y_train.flatten() > 0
    y_test = y_test.values

    if config.ml_train:

        try:
            ml_train(X, extra_crispr_df, y, train_index, test_index)

        except Exception as e:
            logger.debug("Failed to train the random forest model: %s" % e)
        finally:
            h2o.cluster().shutdown()
        return

    logger.debug("Building the RNN graph")
    weight_matrix = [utils.get_weight_matrix()] if config.word2vec_weight_matrix else None
    for_seq_input = Input(shape=(for_input.shape[1], ))
    rev_seq_input = Input(shape=(rev_input.shape[1], ))
    for_cnn_input = Input(shape=(for_cnn.shape[1], ))
    bio_features = Input(shape=(extra_x_train.shape[1], ))
    off_target_features = Input(shape=(off_target_X_train.shape[1], ))
    all_features = Input(shape=(for_input.shape[1] + rev_input.shape[1] +
                                extra_x_train.shape[1] +
                                off_target_X_train.shape[1], ))
    if not config.ensemble:
        crispr_model = models.CrisprCasModel(
            bio_features=bio_features,
            for_seq_input=for_seq_input,
            rev_seq_input=rev_seq_input,
            weight_matrix=weight_matrix,
            off_target_features=off_target_features,
            all_features=all_features).get_model()
    else:
        crispr_model = models.CrisprCasModel(
            bio_features=bio_features,
            for_seq_input=for_seq_input,
            rev_seq_input=rev_seq_input,
            for_cnn_input=for_cnn_input,
            weight_matrix=weight_matrix,
            off_target_features=off_target_features,
            all_features=all_features).get_model()

    if config.retraining:
        loaded_model = load_model(config.retraining_model,
                                  custom_objects={
                                      'revised_mse_loss':
                                      utils.revised_mse_loss,
                                      'tf': tf
                                  })
        for layer in loaded_model.layers:
            print(layer.name)

        if config.model_type == 'cnn':

            for_layer = loaded_model.get_layer(name='embedding_1')
            for_layer.trainable = config.fine_tune_trainable

            full_connected = loaded_model.get_layer(name='sequential_6')

        elif (config.model_type == 'mixed') or (config.model_type
                                                == 'ensemble'):

            for_layer = loaded_model.get_layer(name='sequential_5')
            if config.frozen_embedding_only:
                for_layer = for_layer.get_layer(name='embedding_1')
            for_layer.trainable = config.fine_tune_trainable

            cnn_layer = loaded_model.get_layer(name='embedding_2')
            cnn_layer.trainable = config.fine_tune_trainable
            if not config.frozen_embedding_only:
                cnn_layer_1 = loaded_model.get_layer(name='sequential_3')
                cnn_layer_2 = loaded_model.get_layer(name='sequential_4')
                cnn_layer_1.trainable = config.fine_tune_trainable
                cnn_layer_2.trainable = config.fine_tune_trainable

            full_connected = loaded_model.get_layer(name='sequential_6')

        else:
            for_layer = loaded_model.get_layer(name='sequential_5')
            if config.frozen_embedding_only:

                for_layer = for_layer.get_layer(name='embedding_1')
            for_layer.trainable = config.fine_tune_trainable
            if config.rev_seq:
                rev_layer = loaded_model.get_layer(name='sequential_2')
                if config.frozen_embedding_only:
                    rev_layer = rev_layer.get_layer(name='embedding_2')
                rev_layer.trainable = config.fine_tune_trainable
                full_connected = loaded_model.get_layer(name='sequential_3')
            else:
                full_connected = loaded_model.get_layer(name='sequential_6')

        for i in range(
                int((len(full_connected.layers) / 4) *
                    (1 - config.fullly_connected_train_fraction))):

            dense_layer = full_connected.get_layer(name='dense_' + str(i + 1))
            dense_layer.trainable = config.fine_tune_trainable

        crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
            loaded_model)

    utils.output_model_info(crispr_model)
    logger.debug("Built the RNN model successfully")

    try:
        if config.training:
            logger.debug("Training the model")
            # x_train = x_train.values.astype('int32').reshape((-1, 21, 200))
            checkpoint = ModelCheckpoint(config.temp_hdf5_path,
                                         verbose=1,
                                         save_best_only=True,
                                         period=1)
            reduce_lr = LearningRateScheduler(utils.cosine_decay_lr)

            logger.debug("augmenting data")
            processed_for_input = utils.augment_data(
                for_input, filter=filter,
                is_seq=True) if config.augment_data else for_input

            if config.augment_data:
                if rev_input.shape[0] and rev_input.shape[1]:
                    processed_rev_input = utils.augment_data(rev_input,
                                                             filter=filter,
                                                             is_seq=True,
                                                             is_rev=True)
                else:
                    processed_rev_input = utils.augment_data(rev_input,
                                                             filter=filter)
            else:
                processed_rev_input = rev_input

            processed_off_target_X_train = utils.augment_data(
                off_target_X_train,
                filter=filter) if config.augment_data else off_target_X_train
            processed_extra_x_train = utils.augment_data(
                extra_x_train,
                filter=filter) if config.augment_data else extra_x_train
            processed_y_train = utils.augment_data(
                y_train, filter=filter) if config.augment_data else y_train
            logger.debug("augmented data successfully")

            logger.debug("selecting %d data for training" %
                         (config.retraining_datasize * len(processed_y_train)))
            index_range = list(range(len(processed_y_train)))
            np.random.shuffle(index_range)
            selected_index = index_range[:int(config.retraining_datasize *
                                              len(processed_y_train))]
            logger.debug("selecting %d data for training" %
                         (config.retraining_datasize * len(processed_y_train)))

            features_list = [
                processed_for_input[selected_index],
                processed_rev_input[selected_index],
                processed_off_target_X_train[selected_index],
                processed_extra_x_train[selected_index]
            ]

            if config.ensemble:
                processed_for_cnn = utils.augment_data(
                    for_cnn, filter=filter,
                    is_seq=True) if config.augment_data else for_cnn
                features_list.append(processed_for_cnn[selected_index])
                print("ensemble")
                print(len(features_list))

            training_history = utils.print_to_training_log(crispr_model.fit)(
                x=features_list,
                validation_split=0.05,
                y=processed_y_train[selected_index],
                epochs=config.n_epochs,
                batch_size=config.batch_size,
                verbose=2,
                callbacks=[checkpoint, reduce_lr])

            logger.debug("Saving history")
            with open(config.training_history, 'wb') as history_file:
                pickle.dump(training_history.history, history_file)
            logger.debug("Saved training history successfully")

            logger.debug("Trained crispr model successfully")

        else:
            logger.debug("Logging in old model")
            loaded_model = load_model(config.old_model_hdf5,
                                      custom_objects={
                                          'revised_mse_loss':
                                          utils.revised_mse_loss,
                                          'tf': tf
                                      })
            crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
                loaded_model)
            crispr_model.save(config.temp_hdf5_path)
            logger.debug("Load in model successfully")

    except KeyboardInterrupt as e:

        logger.debug("Loading model")
        loaded_model = load_model(config.temp_hdf5_path,
                                  custom_objects={
                                      'revised_mse_loss':
                                      utils.revised_mse_loss,
                                      'tf': tf
                                  })
        crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
            loaded_model)
        logger.debug("Load in model successfully")

    logger.debug("Persisting model")
    # serialize weights to HDF5
    crispr_model.save(config.hdf5_path)
    print("Saved model to disk")

    logger.debug("Loading best model for testing")
    loaded_model = load_model(config.temp_hdf5_path,
                              custom_objects={
                                  'revised_mse_loss': utils.revised_mse_loss,
                                  'tf': tf
                              })
    crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
        loaded_model)
    logger.debug("Load in model successfully")

    logger.debug("Predicting data with best model")
    train_list = [
        for_input[unique_train_index], rev_input[unique_train_index],
        off_target_X_train[unique_train_index],
        extra_x_train[unique_train_index]
    ]
    if config.ensemble:
        train_list.append(for_cnn[unique_train_index])
    train_prediction = crispr_model.predict(x=train_list)
    train_performance = spearmanr(train_prediction,
                                  y_train[unique_train_index])
    logger.debug(
        "GRU model spearman correlation coefficient for training dataset is: %s"
        % str(train_performance))

    get_prediction = getattr(sys.modules[__name__],
                             "get_prediction_" + config.test_method,
                             get_prediction_group)
    test_list = [for_x_test, rev_x_test, off_target_X_test, extra_x_test]
    if config.ensemble:
        test_list.append(for_cnn_test)
    performance, prediction = get_prediction(crispr_model, test_index_list,
                                             unique_test_index, y_test,
                                             test_list)
    logger.debug("GRU model spearman correlation coefficient: %s" %
                 str(performance))

    logger.debug("Loading last model for testing")
    loaded_model = load_model(config.hdf5_path,
                              custom_objects={
                                  'revised_mse_loss': utils.revised_mse_loss,
                                  'tf': tf
                              })
    crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
        loaded_model)
    logger.debug("Load in model successfully")

    logger.debug("Predicting data with last model")
    last_train_prediction = crispr_model.predict(x=train_list)
    last_train_performance = spearmanr(last_train_prediction,
                                       y_train[unique_train_index])
    utils.output_config_info()
    logger.debug(
        "GRU model spearman correlation coefficient for training dataset is: %s"
        % str(last_train_performance))

    last_performance, last_prediction = get_prediction(crispr_model,
                                                       test_index_list,
                                                       unique_test_index,
                                                       y_test, test_list)
    logger.debug("GRU model spearman correlation coefficient: %s" %
                 str(last_performance))

    logger.debug("Saving test and prediction data plot")
    if last_performance > performance:
        prediction = last_prediction
    utils.ytest_and_prediction_output(y_test[unique_test_index], prediction)
    logger.debug("Saved test and prediction data plot successfully")

    if config.check_feature_importance:
        if performance > last_performance:
            loaded_model = load_model(config.temp_hdf5_path,
                                      custom_objects={
                                          'revised_mse_loss':
                                          utils.revised_mse_loss,
                                          'tf': tf
                                      })
            crispr_model = models.CrisprCasModel.compile_transfer_learning_model(
                loaded_model)
        logger.debug("Getting features ranks")
        names = []
        names += ["for_" + str(i) for i in range(for_input.shape[1])]
        names += ["rev_" + str(i) for i in range(rev_input.shape[1])]
        names += ["off_" + str(i) for i in range(off_target_X_train.shape[1])]
        names += config.extra_categorical_features + config.extra_numerical_features
        ranker = feature_imp.InputPerturbationRank(names)
        feature_ranks = ranker.rank(
            20, y_test[unique_test_index], crispr_model,
            [data[unique_test_index] for data in test_list])
        feature_ranks_df = pd.DataFrame(feature_ranks)
        feature_ranks_df.to_csv(config.feature_importance_path, index=False)
        logger.debug("Get features ranks successfully")
    for current_file_path in file_list:

        # use the base file name (without extension) as an id
        file_id = current_file_path.split("/")[-1].split(".")[0]

        mfcc_save_path = "../../data/augmented_data/" + str(
            flag) + "mfcc/" + str(file_id) + ".png"
        spec_save_path = "../../data/augmented_data/" + str(
            flag) + "spec/" + str(file_id) + ".png"
        raw_save_path = "../../data/augmented_data/" + str(
            flag) + "raw_vectors/" + str(file_id) + ".npy"
        wav_save_path = "../../data/augmented_data/" + str(
            flag) + "wave_files/" + str(file_id) + ".png"

        print(current_file_path)
        y, sr = librosa.load(current_file_path)
        augmented_data = augment_data(y)
        for i in range(4):

            mfcc_save_path = "../../data/augmented_data/" + str(
                flag) + "/mfcc/" + str(file_id) + "_" + str(i) + ".png"
            spec_save_path = "../../data/augmented_data/" + str(
                flag) + "/spec/" + str(file_id) + "_" + str(i) + ".png"
            raw_save_path = "../../data/augmented_data/" + str(
                flag) + "/raw_vectors/" + str(file_id) + "_" + str(i) + ".npy"
            wav_save_path = "../../data/augmented_data/" + str(
                flag) + "/wav_files/" + str(file_id) + "_" + str(i) + ".png"

            current_data = augmented_data[i]
            # each clip is expected to be 110250 samples (likely 5 s at librosa's default 22050 Hz)
            if len(current_data) != 110250:
                print("Warning: unexpected clip length %d for file %s" % (len(current_data), file_id))
            # temp_path = str(i)+".png"
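# The audio augment_data used above is not shown in this listing; a minimal sketch,
# assuming it returns four length-preserving variants of the clip (added noise, gain
# change, circular time shift, pitch shift), which would be consistent with the
# fixed-length check above. This is an illustrative assumption, not the original code.
import numpy as np
import librosa

def augment_data_sketch(y, sr=22050):
    """Return four augmented copies of a waveform, all with the original length."""
    noisy = y + 0.005 * np.random.randn(len(y))
    louder = y * 1.2
    shifted = np.roll(y, sr // 10)  # shift by 0.1 s, wrapping around
    pitched = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)
    return [noisy, louder, shifted, pitched]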