Exemple #1
0
def main(args):
    """Score contigs with a previously trained model.

    Reads the pickled training mean/std (``args.mstd_name``) and the saved
    Keras model (``args.model_name``) from ``args.model_path``, loads the
    features listed in ``args.feature_file_table`` and passes everything to
    ``Utils.compute_predictions`` along with ``args.save_path`` and
    ``args.save_name``.

    Raises:
        IOError: if the mean/std pickle or the model file does not exist.
    """
    def _require_file(path):
        # Both model artefacts share the same "missing file" error message.
        if not os.path.exists(path):
            raise IOError(
                'Model file not available at data-path: {}'.format(path))
        return path

    np.random.seed(args.seed)

    # CPU only instead of GPU
    if args.cpu_only:
        logging.info('Setting env for CPU-only mode...')
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

    # Custom objects handed to load_model() when deserialising the network.
    custom_obj = {'metr': Utils.class_recall(0)}
    # The class-1 recall metric is built but not used by this function.
    Utils.class_recall(1)

    logging.info('Loading model...')

    ## pkl: normalisation statistics saved at training time
    logging.info('  Loading mstd...')
    mstd_file = _require_file(os.path.join(args.model_path, args.mstd_name))
    with open(mstd_file, 'rb') as handle:
        mean_tr, std_tr = pickle.load(handle)

    ## h5: the trained network
    logging.info('  Loading h5...')
    h5_file = _require_file(os.path.join(args.model_path, args.model_name))
    model = load_model(h5_file, custom_objects=custom_obj)

    # outdir
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    logging.info('Loading features...')
    x, y, i2n = Utils.load_features_nogt(
        args.feature_file_table,
        force_overwrite=args.force_overwrite,
        pickle_only=args.pickle_only,
        n_procs=args.n_procs)

    n_contigs = len(set(i2n.values()))
    logging.info('Loaded {} contigs'.format(n_contigs))
    n2i = Utils.reverse_dict(i2n)

    # Flatten the nested feature lists into one list and join the labels.
    flat_x = []
    for group in x:
        flat_x.extend(group)
    flat_y = np.concatenate(y)

    logging.info('Running model generator...')
    dataGen = Models.Generator(flat_x,
                               flat_y,
                               batch_size=64,
                               shuffle=False,
                               norm_raw=0,
                               mean_tr=mean_tr,
                               std_tr=std_tr)

    logging.info('Computing predictions...')
    Utils.compute_predictions(n2i, dataGen, model, args.save_path,
                              args.save_name)
Exemple #2
0
    def __init__(self, config):
        """Build and compile the convolutional binary classifier.

        All hyper-parameters are copied from ``config`` onto the instance.
        The network stored in ``self.net`` is a ``Sequential`` model made of:

        * one ``Conv2D`` whose kernel spans every feature column
          (``kernel_size=(2, n_features)``, ``'valid'`` padding);
        * ``n_conv - 1`` further stride-2 ``Conv2D`` layers with
          ``2**i * filters`` filters, each followed by batch normalisation;
        * average pooling, flattening, ``n_fc - 1`` hidden dense layers with
          dropout, and a single sigmoid output unit.

        The model is compiled with binary cross-entropy and the per-class
        recall metrics from ``Utils.class_recall``.  A ``ReduceLROnPlateau``
        callback is stored in ``self.reduce_lr``.

        Args:
            config: object exposing ``max_len``, ``filters``, ``n_conv``,
                ``n_features``, ``pool_window``, ``dropout``, ``lr_init``,
                ``n_fc`` and ``n_hid``.
        """
        self.max_len = config.max_len
        self.filters = config.filters
        self.n_conv = config.n_conv
        self.n_features = config.n_features
        self.pool_window = config.pool_window
        self.dropout = config.dropout
        self.lr_init = config.lr_init
        self.n_fc = config.n_fc
        self.n_hid = config.n_hid

        self.net = Sequential()

        # First convolution collapses the feature axis.
        self.net.add(
            Conv2D(self.filters,
                   kernel_size=(2, self.n_features),
                   input_shape=(self.max_len, self.n_features, 1),
                   activation='relu',
                   padding='valid'))
        self.net.add(BatchNormalization(axis=-1))

        # Each further convolution doubles the filter count and uses stride 2.
        # NOTE(review): `input_shape` on a non-first Sequential layer looks
        # redundant (the shape comes from the previous layer) — confirm
        # before removing.
        for i in range(1, self.n_conv):
            self.net.add(
                Conv2D(2**i * self.filters,
                       kernel_size=(2, 1),
                       strides=2,
                       input_shape=(self.max_len, 1,
                                    2**(i - 1) * self.filters),
                       activation='relu'))
            self.net.add(BatchNormalization(axis=-1))

        self.net.add(AveragePooling2D((self.pool_window, 1)))
        self.net.add(Flatten())

        # binary classification
        for _ in range(self.n_fc - 1):
            self.net.add(Dense(self.n_hid, activation='relu'))
            self.net.add(Dropout(rate=self.dropout))

        self.net.add(Dense(1, activation='sigmoid'))
        # NOTE(review): dropout is applied after the sigmoid output unit —
        # confirm this is intended.
        self.net.add(Dropout(rate=self.dropout))

        # Use the canonical class name; the lowercase `adam` alias is not
        # available in every Keras release.
        optimizer = keras.optimizers.Adam(lr=self.lr_init)

        recall_0 = Utils.class_recall(0)
        recall_1 = Utils.class_recall(1)
        self.net.compile(loss='binary_crossentropy',
                         optimizer=optimizer,
                         metrics=[recall_0, recall_1])

        # Halve the learning rate after 5 epochs without val_loss
        # improvement, never going below 1% of the initial rate.
        self.reduce_lr = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=0.01 * self.lr_init)
Exemple #3
0
    def __init__(self,
                 x,
                 y,
                 max_len=10000,
                 batch_size=32,
                 shuffle=True,
                 norm_raw=True,
                 mean_tr=None,
                 std_tr=None):
        """Set up the batch generator and its normalisation statistics.

        Args:
            x: sequence of per-sample feature arrays; the number of features
                is taken from ``x[0].shape[1]``.
            y: labels, stored as given.
            max_len: stored on the instance as ``self.max_len``.
            batch_size: stored on the instance as ``self.batch_size``.
            shuffle: when true, the sample indices are shuffled here.
            norm_raw: only consulted when the statistics are computed here;
                if falsy, the first four entries of the mean are set to 0
                and of the std to 1.
            mean_tr: precomputed mean; when given, it and ``std_tr`` are
                used as-is and nothing is computed from ``x``.
            std_tr: precomputed std, paired with ``mean_tr``.
        """
        self.x = x
        self.y = y
        self.max_len = max_len
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_feat = x[0].shape[1]

        if mean_tr is not None:
            # Reuse statistics supplied by the caller.
            self.mean, self.std = mean_tr, std_tr
        else:
            self.mean, self.std = Utils.compute_mean_std(self.x)
            if not norm_raw:
                # Neutralise normalisation for the first four features.
                self.mean[0:4] = 0
                self.std[0:4] = 1

        # Shuffle data
        self.indices = np.arange(len(x))
        if self.shuffle:
            np.random.shuffle(self.indices)

        self.on_epoch_end()
Exemple #4
0
def main(args):
    """Train DeepMAsED with k-fold cross-validation or on all pooled data.

    With ``args.n_folds > -1`` one model is trained per fold.  Each fold's
    model is saved as ``<fold>_model.h5`` and the running list of
    average-precision scores is pickled to ``scores.pkl``.  If the model of
    the last fold already exists the process exits immediately.

    With a negative ``args.n_folds`` all data is pooled, a single model is
    trained and saved as ``final_model.h5``, and the generator's mean/std
    are pickled to ``mean_std_final_model.pkl``.

    All outputs are written below ``args.save_path``.
    """
    np.random.seed(12)

    # Build model
    config = Config(args)

    deepmased = Models.deepmased(config)
    deepmased.print_summary()

    save_path = args.save_path
    os.makedirs(save_path, exist_ok=True)

    # Load and process data
    logging.info('Loading data...')
    x, y = Utils.load_features_tr(args.data_path,
                                  max_len=args.max_len,
                                  standard=args.standard,
                                  mode=config.mode,
                                  pickle_only=args.pickle_only)

    if args.n_folds > -1:
        last_fold_model = os.path.join(save_path,
                                       str(args.n_folds - 1) + '_model.h5')
        if os.path.exists(last_fold_model):
            # Cross-validation output is already present; nothing to do.
            # SystemExit is raised directly because the `exit()` helper is
            # only installed by the `site` module.
            raise SystemExit(0)

        ap_scores = []
        for val_idx in range(args.n_folds):
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx,
                                                   k=args.n_folds)
            deepmased = Models.deepmased(config)

            # Construct generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                       batch_size=64,
                                       norm_raw=bool(args.norm_raw))

            # Validation generator reuses the training mean/std.
            dataGen_val = Models.Generator(x_val, y_val, args.max_len,
                                           batch_size=64,
                                           shuffle=False,
                                           norm_raw=bool(args.norm_raw),
                                           mean_tr=dataGen.mean,
                                           std_tr=dataGen.std)

            # Train model
            tb_logs = keras.callbacks.TensorBoard(
                log_dir=os.path.join(save_path, 'logs'),
                histogram_freq=0,
                write_graph=True,
                write_images=True)
            logging.info('Training network...')
            if config.mode in ['chimera', 'extensive']:
                # NOTE(review): no class weighting is applied; pass
                # `class_weight=` here if class imbalance should be
                # compensated.
                deepmased.net.fit_generator(
                    generator=dataGen,
                    validation_data=dataGen_val,
                    epochs=args.n_epochs,
                    use_multiprocessing=True,
                    verbose=2,
                    callbacks=[tb_logs, deepmased.reduce_lr])
            elif config.mode == 'edit':
                # Standardise the targets with statistics from the training
                # split only, then apply the same transform to validation.
                st = StandardScaler()
                y_tr_scaled = st.fit_transform(y_tr)
                y_val_scaled = st.transform(y_val)
                deepmased.net.fit(x_tr, y_tr_scaled,
                                  validation_data=(x_val, y_val_scaled),
                                  epochs=args.n_epochs,
                                  callbacks=[tb_logs, deepmased.reduce_lr])

            logging.info('Computing AUC scores...')
            scores_val = deepmased.predict_generator(dataGen_val)

            # Labels are truncated to the number of predictions returned.
            ap_scores.append(
                average_precision_score(y_val[0:scores_val.size],
                                        scores_val))

            deepmased.save(os.path.join(save_path,
                                        str(val_idx) + '_model.h5'))

            # Rewritten after every fold so partial results survive a crash.
            with open(os.path.join(save_path, 'scores.pkl'), 'wb') as f:
                pickle.dump(ap_scores, f)

    else:
        # No cross-validation: pool the nested x and y into flat containers.
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)

        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs_final'),
            histogram_freq=0,
            write_graph=True,
            write_images=True)
        logging.info('Training network...')
        # NOTE(review): only 'chimera'/'extensive' are trained here; in any
        # other mode the untrained model is still saved — confirm intended.
        if config.mode in ['chimera', 'extensive']:
            deepmased.net.fit_generator(
                generator=dataGen,
                epochs=args.n_epochs,
                use_multiprocessing=True,
                verbose=2,
                callbacks=[tb_logs, deepmased.reduce_lr])

        logging.info('Saving trained model...')
        outfile = os.path.join(save_path, 'final_model.h5')
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))

        # The training mean/std are needed to normalise data at prediction.
        outfile = os.path.join(save_path, 'mean_std_final_model.pkl')
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
Exemple #5
0
def main(args):
    """Main interface

    Iterates over every entry of ``args.save_path``.  Entries that contain
    a ``final_model.h5`` are treated as trained models: the model and its
    ``mean_std_final_model.pkl`` are loaded, features are read from
    ``args.data_path``, and the prediction scores are pickled to
    ``<entry>/predictions/<data name>/<technology>.pkl``, where the data
    name is the last ``/``-separated component of ``args.data_path``.
    """
    np.random.seed(12)

    # Custom objects handed to load_model() when deserialising the network.
    custom_obj = {'metr': Utils.class_recall(0)}

    data_name = args.data_path.split('/')[-1]
    tech = args.technology

    for model_path in os.listdir(args.save_path):
        model_dir = os.path.join(args.save_path, model_path)
        h5_file = os.path.join(model_dir, 'final_model.h5')
        if not os.path.exists(h5_file):
            # Not a trained-model directory.
            continue

        # Creates both `predictions/` and the per-dataset sub-directory.
        pred_dir = os.path.join(model_dir, 'predictions', data_name)
        os.makedirs(pred_dir, exist_ok=True)

        with open(os.path.join(model_dir, 'mean_std_final_model.pkl'),
                  'rb') as mstd:
            mean_tr, std_tr = pickle.load(mstd)

        model = load_model(h5_file, custom_objects=custom_obj)

        logging.info('Loading data...')
        if args.is_synthetic == 1:
            x, y, i2n = Utils.load_features(args.data_path,
                                            max_len=args.max_len,
                                            mode=args.mode,
                                            technology=tech)
        else:
            x, y, i2n = Utils.load_features_nogt(args.data_path,
                                                 max_len=args.max_len,
                                                 mode=args.mode)

        logging.info('Loaded {} contigs...'.format(len(set(i2n.values()))))

        n2i = Utils.reverse_dict(i2n)
        # Flatten the nested feature lists and join the label arrays.
        x = [xi for xmeta in x for xi in xmeta]
        y = np.concatenate(y)

        dataGen = Models.Generator(x,
                                   y,
                                   args.max_len,
                                   batch_size=64,
                                   shuffle=False,
                                   norm_raw=bool(args.norm_raw),
                                   mean_tr=mean_tr,
                                   std_tr=std_tr)

        logging.info('Computing predictions for {}...'.format(tech))

        # NOTE(review): `model` and `dataGen` are not passed to
        # compute_predictions — confirm that helper's signature.
        scores = compute_predictions(y, n2i)
        outfile = os.path.join(pred_dir, tech + '.pkl')
        with open(outfile, 'wb') as spred:
            pickle.dump(scores, spred)
        logging.info('File written: {}'.format(outfile))
Exemple #6
0
def main(args):
    """Main interface

    Loads the trained model (``args.model_name``) and its mean/std pickle
    (``args.mstd_name``) from ``args.model_path``, loads the features listed
    in ``args.feature_file_table`` and pickles the prediction scores to
    ``<save_path>/<save_name>_<technology>.pkl``.

    Raises:
        IOError: if the model file does not exist.
    """
    # init
    np.random.seed(args.seed)

    # Check for the model file before any other work is done.
    h5_file = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(h5_file):
        msg = 'Cannot find {} file in {}'
        raise IOError(msg.format(args.model_name, args.model_path))

    # Load and process data
    # Provide objective to load
    logging.info('Loading data...')
    custom_obj = {'metr': Utils.class_recall(0)}

    logging.info('Loading model: {}'.format(h5_file))
    model = load_model(h5_file, custom_objects=custom_obj)

    # model pkl
    pkl_file = os.path.join(args.model_path, args.mstd_name)
    logging.info('Loading file: {}'.format(pkl_file))
    with open(pkl_file, 'rb') as mstd:
        mean_tr, std_tr = pickle.load(mstd)

    # loading features
    if args.is_synthetic == 1:
        logging.info('Loading synthetic features')
        x, y, i2n = Utils.load_features(args.feature_file_table,
                                        max_len=args.max_len,
                                        technology=args.technology,
                                        force_overwrite=args.force_overwrite,
                                        n_procs=args.n_procs)
    else:
        logging.info('Loading non-synthetic features')
        x, y, i2n = Utils.load_features_nogt(
            args.feature_file_table,
            max_len=args.max_len,
            force_overwrite=args.force_overwrite,
            n_procs=args.n_procs)

    logging.info('Loaded {} contigs'.format(len(set(i2n.values()))))
    n2i = Utils.reverse_dict(i2n)
    # Flatten the nested feature lists and join the label arrays.
    x = [xi for xmeta in x for xi in xmeta]
    y = np.concatenate(y)

    logging.info('Running model generator...')
    dataGen = Models.Generator(x,
                               y,
                               args.max_len,
                               batch_size=64,
                               shuffle=False,
                               norm_raw=bool(args.norm_raw),
                               mean_tr=mean_tr,
                               std_tr=std_tr)

    # Create the output directory before predicting, so the result file
    # can always be written.
    os.makedirs(args.save_path, exist_ok=True)

    logging.info('Computing predictions for {}...'.format(args.technology))
    scores = Utils.compute_predictions_y_known(y, n2i, model, dataGen)
    outfile = os.path.join(
        args.save_path, '_'.join([args.save_name, args.technology + '.pkl']))
    with open(outfile, 'wb') as spred:
        pickle.dump(scores, spred)
    logging.info('File written: {}'.format(outfile))
Exemple #7
0
def main(args):
    """Train a DeepMAsED model with k-fold cross-validation or pooled data.

    With ``args.n_folds >= 0`` one model is trained per fold; each is saved
    as ``<fold>_model.h5`` and the running average-precision scores are
    pickled to ``scores.pkl``.  With a negative ``args.n_folds`` all data is
    pooled and one model plus its mean/std pickle is written, both named
    from ``args.save_name`` and ``args.technology``.

    Raises:
        IOError: in k-fold mode, if the last fold's model file already
            exists and ``args.force_overwrite`` is ``False``.
    """
    # init
    np.random.seed(args.seed)
    save_path = args.save_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Build model
    config = Config(args)
    if not args.pickle_only:
        logging.info('Building model')
        deepmased = Models.deepmased(config)
        deepmased.print_summary()

    # Load and process data
    x, y = Utils.load_features_tr(args.feature_file_table,
                                  max_len=args.max_len,
                                  technology=args.technology,
                                  pickle_only=args.pickle_only,
                                  force_overwrite=args.force_overwrite,
                                  n_procs=args.n_procs)

    # Keyword arguments shared by every fit_generator() call below.
    fit_kwargs = dict(epochs=args.n_epochs,
                      use_multiprocessing=args.n_procs > 1,
                      workers=args.n_procs,
                      verbose=2)
    do_norm_raw = bool(args.norm_raw)

    if args.n_folds < 0:
        # Skip kfold and simply pool all the data for training
        ## all elements in x and y are combined
        logging.info('NOTE: Training on all pooled data!')
        pooled_x = []
        for sublist in x:
            pooled_x.extend(sublist)
        pooled_y = np.concatenate(y)

        logging.info('Constructing model...')
        dataGen = Models.Generator(pooled_x, pooled_y, args.max_len,
                                   batch_size=64, norm_raw=do_norm_raw)
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs_final'),
            histogram_freq=0,
            write_graph=True,
            write_images=True)

        logging.info('Training network...')
        deepmased.net.fit_generator(generator=dataGen,
                                    callbacks=[tb_logs, deepmased.reduce_lr],
                                    **fit_kwargs)

        logging.info('Saving trained model...')
        name_parts = [args.save_name, args.technology, 'model.h5']
        outfile = os.path.join(save_path, '_'.join(name_parts))
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))

        name_parts = [args.save_name, args.technology, 'mean_std.pkl']
        outfile = os.path.join(save_path, '_'.join(name_parts))
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
        return

    # kfold cross validation
    n_folds = args.n_folds
    logging.info('Running kfold cross validation. n-folds: {}'.format(n_folds))
    outfile_h5 = os.path.join(save_path, str(n_folds - 1) + '_model.h5')
    if os.path.exists(outfile_h5) and args.force_overwrite is False:
        raise IOError(
            'Output already exists ({}). Use --force-overwrite to overwrite the file'
            .format(outfile_h5))

    # iter over folds
    ap_scores = []
    for val_idx in range(n_folds):
        logging.info('Fold {}: Constructing model...'.format(val_idx))
        x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=n_folds)
        deepmased = Models.deepmased(config)

        # Training generator computes its own mean/std.
        dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                   batch_size=64, norm_raw=do_norm_raw)

        # Validation generator reuses the training mean/std.
        dataGen_val = Models.Generator(x_val, y_val, args.max_len,
                                       batch_size=64,
                                       shuffle=False,
                                       norm_raw=do_norm_raw,
                                       mean_tr=dataGen.mean,
                                       std_tr=dataGen.std)

        # Train model
        tb_logs = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs'),
            histogram_freq=0,
            write_graph=True,
            write_images=True)
        logging.info('Fold {}: Training network...'.format(val_idx))

        ## binary classification (extensive misassembly)
        n_zero = len(np.where(y_tr == 0)[0])
        n_one = len(np.where(y_tr == 1)[0])
        if n_one:
            w_one = int(n_zero / n_one)
        else:
            logging.warning('  No misassemblies present!')
            w_one = 0
        # This dict is built but not passed to fit_generator below.
        class_weight = {0: 1, 1: w_one}

        deepmased.net.fit_generator(generator=dataGen,
                                    validation_data=dataGen_val,
                                    callbacks=[tb_logs, deepmased.reduce_lr],
                                    **fit_kwargs)

        # AUC scores
        logging.info('Fold {}: Computing AUC scores...'.format(val_idx))
        scores_val = deepmased.predict_generator(dataGen_val)
        # Labels are truncated to the number of predictions returned.
        fold_ap = average_precision_score(y_val[0:scores_val.size],
                                          scores_val)
        ap_scores.append(fold_ap)

        # Saving data
        outfile_h5_fold = os.path.join(save_path,
                                       str(val_idx) + '_model.h5')
        deepmased.save(outfile_h5_fold)
        logging.info('Fold {}: File written: {}'.format(val_idx,
                                                        outfile_h5_fold))

        # The scores file is rewritten after every fold.
        outfile_pkl_fold = os.path.join(save_path, 'scores.pkl')
        with open(outfile_pkl_fold, 'wb') as f:
            pickle.dump(ap_scores, f)
        logging.info('Fold {}: File written: {}'.format(val_idx,
                                                        outfile_pkl_fold))
Exemple #8
0
    def __init__(self, config):
        """Build and compile the CNN for the training mode in ``config.mode``.

        All hyper-parameters are copied from ``config`` onto the instance.
        The convolutional trunk is shared by every mode: one ``Conv2D``
        spanning all feature columns, ``n_conv - 1`` stride-2 ``Conv2D``
        layers with ``2**i * filters`` filters (each followed by batch
        normalisation), average pooling and flattening.  The head depends
        on the mode:

        * ``'chimera'`` / ``'extensive'``: ``n_fc - 1`` hidden dense layers
          with dropout and a sigmoid output, compiled with binary
          cross-entropy and per-class recall metrics.
        * ``'edit'``: two 20-unit dense layers with dropout and a linear
          output, compiled with mean absolute error and
          ``Utils.explained_var``.

        A ``ReduceLROnPlateau`` callback is stored in ``self.reduce_lr``.

        Args:
            config: object exposing ``max_len``, ``filters``, ``n_conv``,
                ``n_features``, ``pool_window``, ``dropout``, ``lr_init``,
                ``mode``, ``n_fc`` and ``n_hid``.

        Raises:
            ValueError: if ``config.mode`` is not a supported mode.
        """
        self.max_len = config.max_len
        self.filters = config.filters
        self.n_conv = config.n_conv
        self.n_features = config.n_features
        self.pool_window = config.pool_window
        self.dropout = config.dropout
        self.lr_init = config.lr_init
        self.mode = config.mode
        self.n_fc = config.n_fc
        self.n_hid = config.n_hid

        # Reject unknown modes before any layer is built.
        if self.mode not in ('chimera', 'extensive', 'edit'):
            raise ValueError(
                'Training mode "{}" not supported.'.format(self.mode))

        self.net = Sequential()

        # First convolution collapses the feature axis.
        self.net.add(
            Conv2D(self.filters,
                   kernel_size=(2, self.n_features),
                   input_shape=(self.max_len, self.n_features, 1),
                   activation='relu',
                   padding='valid'))
        self.net.add(BatchNormalization(axis=-1))

        # Each further convolution doubles the filter count and uses stride 2.
        # NOTE(review): `input_shape` on a non-first Sequential layer looks
        # redundant (the shape comes from the previous layer) — confirm
        # before removing.
        for i in range(1, self.n_conv):
            self.net.add(
                Conv2D(2**i * self.filters,
                       kernel_size=(2, 1),
                       strides=2,
                       input_shape=(self.max_len, 1,
                                    2**(i - 1) * self.filters),
                       activation='relu'))
            self.net.add(BatchNormalization(axis=-1))

        self.net.add(AveragePooling2D((self.pool_window, 1)))
        self.net.add(Flatten())

        # Use the canonical class name; the lowercase `adam` alias is not
        # available in every Keras release.
        optimizer = keras.optimizers.Adam(lr=self.lr_init)

        if self.mode in ['chimera', 'extensive']:
            # Binary classification head.
            for _ in range(self.n_fc - 1):
                self.net.add(Dense(self.n_hid, activation='relu'))
                self.net.add(Dropout(rate=self.dropout))

            self.net.add(Dense(1, activation='sigmoid'))
            # NOTE(review): dropout is applied after the sigmoid output
            # unit — confirm this is intended.
            self.net.add(Dropout(rate=self.dropout))

            recall_0 = Utils.class_recall(0)
            recall_1 = Utils.class_recall(1)
            self.net.compile(loss='binary_crossentropy',
                             optimizer=optimizer,
                             metrics=[recall_0, recall_1])
        else:
            # 'edit' mode: regression head with a linear output.
            self.net.add(Dense(20, activation='relu'))
            self.net.add(Dropout(rate=self.dropout))
            self.net.add(Dense(20, activation='relu'))
            self.net.add(Dropout(rate=self.dropout))
            self.net.add(Dense(1, activation='linear'))
            self.net.compile(loss='mean_absolute_error',
                             optimizer=optimizer,
                             metrics=[Utils.explained_var])

        # Halve the learning rate after 5 epochs without val_loss
        # improvement, never going below 1% of the initial rate.
        self.reduce_lr = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=0.01 * self.lr_init)