Example #1
0
def main(args):
    """Run a trained model over a feature table and compute predictions.

    Reads the following attributes from ``args``: ``seed``, ``cpu_only``,
    ``model_path``, ``mstd_name``, ``model_name``, ``save_path``,
    ``feature_file_table``, ``force_overwrite``, ``pickle_only``,
    ``n_procs`` and ``save_name``.

    Raises:
        IOError: if the mean/std pickle or the model file does not exist
            under ``args.model_path``.
    """
    def _require_file(path):
        # Fail early, naming the missing path, before any loader touches it.
        if os.path.exists(path):
            return path
        raise IOError('Model file not available at data-path: {}'.format(path))

    np.random.seed(args.seed)

    # Optionally force CPU execution by hiding every CUDA device
    if args.cpu_only:
        logging.info('Setting env for CPU-only mode...')
        os.environ.update({
            "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
            "CUDA_VISIBLE_DEVICES": '-1',
        })

    # Custom objects needed to deserialize the saved model; only the
    # class-0 recall is registered (under the key 'metr').
    recalls = [Utils.class_recall(label) for label in (0, 1)]
    custom_obj = {'metr': recalls[0]}

    logging.info('Loading model...')
    ## pkl: a (mean, std) pair -- presumably the training-set statistics
    logging.info('  Loading mstd...')
    mstd_file = _require_file(os.path.join(args.model_path, args.mstd_name))
    with open(mstd_file, 'rb') as handle:
        mean_tr, std_tr = pickle.load(handle)
    ## h5: the serialized network
    logging.info('  Loading h5...')
    h5_file = _require_file(os.path.join(args.model_path, args.model_name))
    model = load_model(h5_file, custom_objects=custom_obj)

    # Make sure the output directory is there
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    logging.info('Loading features...')
    x, y, i2n = Utils.load_features_nogt(
        args.feature_file_table,
        force_overwrite=args.force_overwrite,
        pickle_only=args.pickle_only,
        n_procs=args.n_procs,
    )

    n_contigs = len(set(i2n.values()))
    logging.info('Loaded {} contigs'.format(n_contigs))
    n2i = Utils.reverse_dict(i2n)
    # x is a list of lists and y a list of arrays: flatten both by one level
    flat_x = [sample for group in x for sample in group]
    flat_y = np.concatenate(y)

    logging.info('Running model generator...')
    generator_opts = {
        'batch_size': 64,
        'shuffle': False,
        'norm_raw': 0,
        'mean_tr': mean_tr,
        'std_tr': std_tr,
    }
    dataGen = Models.Generator(flat_x, flat_y, **generator_opts)

    logging.info('Computing predictions...')
    Utils.compute_predictions(n2i, dataGen, model, args.save_path,
                              args.save_name)
Example #2
0
def _binary_class_weight(labels):
    """Return the class-weight dict ``{0: 1, 1: w}`` for binary *labels*.

    ``w`` is the truncated ratio of the number of 0-labels to the number of
    1-labels. When there are no 1-labels a warning is logged and ``w`` is 0
    instead of raising ``ZeroDivisionError``.
    """
    n_neg = len(np.where(labels == 0)[0])
    n_pos = len(np.where(labels == 1)[0])
    if n_pos == 0:
        logging.warning('  No misassemblies present!')
        w_one = 0
    else:
        w_one = int(n_neg / n_pos)
    return {0: 1, 1: w_one}


def main(args):
    """Train a model with k-fold cross-validation or on all pooled data.

    With ``args.n_folds > -1`` one model is trained per fold and written to
    ``<save_path>/<fold>_model.h5``; the per-fold average-precision scores
    are re-written to ``<save_path>/scores.pkl`` after every fold. Otherwise
    a single model is trained on the whole data set and saved as
    ``final_model.h5`` together with the generator's mean/std
    (``mean_std_final_model.pkl``).

    Reads from ``args``: ``save_path``, ``data_path``, ``max_len``,
    ``standard``, ``pickle_only``, ``n_folds``, ``norm_raw`` and
    ``n_epochs`` (plus whatever ``Config(args)`` consumes).

    Raises:
        SystemExit: in k-fold mode, when the model file of the last fold
            already exists (the run is considered done).
    """
    np.random.seed(12)

    # Build model
    config = Config(args)

    deepmased = Models.deepmased(config)
    deepmased.print_summary()

    save_path = args.save_path
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(save_path, exist_ok=True)

    # Load and process data
    logging.info('Loading data...')
    x, y = Utils.load_features_tr(args.data_path,
                                  max_len=args.max_len,
                                  standard=args.standard,
                                  mode=config.mode,
                                  pickle_only=args.pickle_only)

    if args.n_folds == -1:
        # Pool everything: flatten x by one level and concatenate y
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)

    if args.n_folds > -1:
        last_fold_model = os.path.join(save_path,
                                       str(args.n_folds - 1) + '_model.h5')
        if os.path.exists(last_fold_model):
            # Same effect as the former exit() call, without relying on the
            # `site`-provided builtin.
            logging.info('Found {}; nothing to do'.format(last_fold_model))
            raise SystemExit

        ap_scores = []
        for val_idx in range(args.n_folds):
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
            # Fresh model for every fold
            deepmased = Models.deepmased(config)

            # Construct generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                       batch_size=64, norm_raw=bool(args.norm_raw))

            # Validation generator reuses the mean/std of the training generator
            dataGen_val = Models.Generator(x_val, y_val, args.max_len, batch_size=64,
                                           shuffle=False, norm_raw=bool(args.norm_raw),
                                           mean_tr=dataGen.mean, std_tr=dataGen.std)

            # Train model
            tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs'),
                                                  histogram_freq=0,
                                                  write_graph=True, write_images=True)
            logging.info('Training network...')
            if config.mode in ['chimera', 'extensive']:
                # NOTE(review): class_weight is computed but has never been
                # passed to fit_generator; add `class_weight=class_weight`
                # below if weighting is actually intended.
                class_weight = _binary_class_weight(y_tr)
                deepmased.net.fit_generator(generator=dataGen,
                                            validation_data=dataGen_val,
                                            epochs=args.n_epochs,
                                            use_multiprocessing=True,
                                            verbose=2,
                                            callbacks=[tb_logs, deepmased.reduce_lr])
            elif config.mode == 'edit':
                st = StandardScaler()
                y_tr = st.fit_transform(y_tr)
                # Fix: the validation split is (x_val, y_val); the previous
                # x_te / y_te names were never defined in this function.
                y_val_scaled = st.transform(y_val)
                deepmased.net.fit(x_tr, y_tr,
                                  validation_data=(x_val, y_val_scaled),
                                  epochs=args.n_epochs,
                                  callbacks=[tb_logs, deepmased.reduce_lr])
                # NOTE(review): the scoring below applies
                # average_precision_score to the unscaled y_val; confirm this
                # is meaningful for 'edit'-mode targets.
            logging.info('Computing AUC scores...')
            scores_val = deepmased.predict_generator(dataGen_val)

            # Labels are truncated to the number of predictions returned
            ap_scores.append(average_precision_score(y_val[0: scores_val.size], scores_val))

            deepmased.save(os.path.join(save_path, str(val_idx) + '_model.h5'))

            # Re-written every fold so partial results survive an interruption
            with open(os.path.join(save_path, 'scores.pkl'), 'wb') as f:
                pickle.dump(ap_scores, f)

    else:
        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs_final'),
                                              histogram_freq=0,
                                              write_graph=True, write_images=True)
        logging.info('Training network...')
        # NOTE(review): only 'chimera'/'extensive' are trained here; for any
        # other mode the untrained model is what gets saved below.
        if config.mode in ['chimera', 'extensive']:
            # NOTE(review): computed but not passed to fit_generator (see above).
            class_weight = _binary_class_weight(y)
            deepmased.net.fit_generator(generator=dataGen,
                                        epochs=args.n_epochs,
                                        use_multiprocessing=True,
                                        verbose=2,
                                        callbacks=[tb_logs, deepmased.reduce_lr])

        logging.info('Saving trained model...')
        outfile = os.path.join(save_path, 'final_model.h5')
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))

        # Persist the generator's mean/std next to the model
        outfile = os.path.join(save_path, 'mean_std_final_model.pkl')
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
Example #3
0
def main(args):
    """Main interface

    For every sub-directory of ``args.save_path`` that contains a
    ``final_model.h5``, load that model and its ``mean_std_final_model.pkl``,
    load the features found at ``args.data_path`` and write the computed
    scores to
    ``<save_path>/<model>/predictions/<data name>/<technology>.pkl``.

    Reads from ``args``: ``save_plot``, ``save_path``, ``data_path``,
    ``technology``, ``is_synthetic``, ``max_len``, ``mode`` and ``norm_raw``.
    """
    np.random.seed(12)

    # Currently unused in this function; kept so that a missing attribute
    # still surfaces here.
    save_plot = args.save_plot
    if save_plot is None:
        save_plot = args.save_path

    # Load and process data
    # Provide objective to load
    recall_0 = Utils.class_recall(0)
    recall_1 = Utils.class_recall(1)
    custom_obj = {'metr': recall_0}

    # Last path component of the data path names the prediction sub-folder
    data_name = args.data_path.split('/')[-1]
    tech = args.technology

    for model_path in os.listdir(args.save_path):
        model_dir = os.path.join(args.save_path, model_path)
        h5_file = os.path.join(model_dir, 'final_model.h5')
        # Only directories that hold a finished model are evaluated
        if not os.path.exists(h5_file):
            continue

        # makedirs creates the intermediate 'predictions' directory as well
        pred_dir = os.path.join(model_dir, 'predictions', data_name)
        os.makedirs(pred_dir, exist_ok=True)

        F = os.path.join(model_dir, 'mean_std_final_model.pkl')
        with open(F, 'rb') as mstd:
            mean_tr, std_tr = pickle.load(mstd)

        model = load_model(h5_file, custom_objects=custom_obj)

        logging.info('Loading data...')
        if args.is_synthetic == 1:
            x, y, i2n = Utils.load_features(args.data_path,
                                            max_len=args.max_len,
                                            mode=args.mode,
                                            technology=tech)
        else:
            x, y, i2n = Utils.load_features_nogt(args.data_path,
                                                 max_len=args.max_len,
                                                 mode=args.mode)

        logging.info('Loaded {} contigs...'.format(len(set(i2n.values()))))

        n2i = Utils.reverse_dict(i2n)
        # Flatten x by one level and concatenate y
        x = [xi for xmeta in x for xi in xmeta]
        y = np.concatenate(y)

        dataGen = Models.Generator(x,
                                   y,
                                   args.max_len,
                                   batch_size=64,
                                   shuffle=False,
                                   norm_raw=bool(args.norm_raw),
                                   mean_tr=mean_tr,
                                   std_tr=std_tr)

        # Fix: was `loggin.info`, which raised NameError at this point
        logging.info('Computing predictions for {}...'.format(tech))

        # NOTE(review): this call is unqualified and receives neither `model`
        # nor `dataGen`, both built above. Confirm that a module-level
        # compute_predictions(y, n2i) exists; otherwise this probably should
        # be a Utils helper taking the model and generator.
        scores = compute_predictions(y, n2i)
        outfile = os.path.join(pred_dir, tech + '.pkl')
        with open(outfile, 'wb') as spred:
            pickle.dump(scores, spred)
        logging.info('File written: {}'.format(outfile))
Example #4
0
def main(args):
    """Main interface

    Load a saved model and its mean/std pickle, load the features listed in
    ``args.feature_file_table``, compute scores and pickle them to
    ``<save_path>/<save_name>_<technology>.pkl``.

    Reads from ``args``: ``seed``, ``save_plot``, ``save_path``,
    ``model_path``, ``model_name``, ``mstd_name``, ``is_synthetic``,
    ``feature_file_table``, ``max_len``, ``technology``,
    ``force_overwrite``, ``n_procs``, ``norm_raw`` and ``save_name``.

    Raises:
        IOError: if the model file is not found in ``args.model_path``.
    """
    # init
    np.random.seed(args.seed)
    ## plot destination falls back to the general output directory
    save_plot = args.save_path if args.save_plot is None else args.save_plot

    # Custom objects required to deserialize the model; only the class-0
    # recall is registered (under the key 'metr').
    logging.info('Loading data...')
    recalls = [Utils.class_recall(label) for label in (0, 1)]
    custom_obj = {'metr': recalls[0]}

    # model h5
    model_file = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(model_file):
        raise IOError('Cannot find {} file in {}'.format(args.model_name,
                                                         args.model_path))
    logging.info('Loading model: {}'.format(model_file))
    model = load_model(model_file, custom_objects=custom_obj)

    # model pkl: a (mean, std) pair handed to the generator below
    mstd_file = os.path.join(args.model_path, args.mstd_name)
    logging.info('Loading file: {}'.format(mstd_file))
    with open(mstd_file, 'rb') as handle:
        mean_tr, std_tr = pickle.load(handle)

    # loading features: the synthetic loader additionally takes the technology
    synthetic = args.is_synthetic == 1
    if synthetic:
        logging.info('Loading synthetic features')
    else:
        logging.info('Loading non-synthetic features')
    loader_kwargs = {
        'max_len': args.max_len,
        'force_overwrite': args.force_overwrite,
        'n_procs': args.n_procs,
    }
    if synthetic:
        loader_kwargs['technology'] = args.technology
        load = Utils.load_features
    else:
        load = Utils.load_features_nogt
    x, y, i2n = load(args.feature_file_table, **loader_kwargs)

    n_contigs = len(set(i2n.values()))
    logging.info('Loaded {} contigs'.format(n_contigs))
    n2i = Utils.reverse_dict(i2n)
    # Flatten x by one level and concatenate y
    flat_x = [sample for group in x for sample in group]
    flat_y = np.concatenate(y)

    logging.info('Running model generator...')
    generator_opts = {
        'batch_size': 64,
        'shuffle': False,
        'norm_raw': bool(args.norm_raw),
        'mean_tr': mean_tr,
        'std_tr': std_tr,
    }
    dataGen = Models.Generator(flat_x, flat_y, args.max_len, **generator_opts)

    logging.info('Computing predictions for {}...'.format(args.technology))
    scores = Utils.compute_predictions_y_known(flat_y, n2i, model, dataGen)
    out_name = '_'.join([args.save_name, args.technology + '.pkl'])
    outfile = os.path.join(args.save_path, out_name)
    with open(outfile, 'wb') as handle:
        pickle.dump(scores, handle)
    logging.info('File written: {}'.format(outfile))
Example #5
0
def main(args):
    """Train a model, either per cross-validation fold or on pooled data.

    ``args.n_folds >= 0`` runs k-fold cross-validation: one model per fold is
    saved as ``<save_path>/<fold>_model.h5`` and the list of per-fold
    average-precision scores is re-written to ``<save_path>/scores.pkl``
    after each fold. A negative ``args.n_folds`` trains a single model on
    all data and writes ``<save_name>_<technology>_model.h5`` plus
    ``<save_name>_<technology>_mean_std.pkl``.

    Reads from ``args``: ``seed``, ``save_path``, ``pickle_only``,
    ``feature_file_table``, ``max_len``, ``technology``,
    ``force_overwrite``, ``n_procs``, ``n_folds``, ``norm_raw``,
    ``n_epochs`` and ``save_name`` (plus whatever ``Config(args)`` consumes).

    Raises:
        IOError: in k-fold mode, if the last fold's model file already
            exists and ``args.force_overwrite`` is ``False``.
    """
    # init
    np.random.seed(args.seed)
    save_path = args.save_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Build model (only printed here; each training run builds its own)
    config = Config(args)
    if not args.pickle_only:
        logging.info('Building model')
        deepmased = Models.deepmased(config)
        deepmased.print_summary()

    # Load and process data
    x, y = Utils.load_features_tr(
        args.feature_file_table,
        max_len=args.max_len,
        technology=args.technology,
        pickle_only=args.pickle_only,
        force_overwrite=args.force_overwrite,
        n_procs=args.n_procs,
    )

    # Settings shared by every fit_generator call below
    fit_opts = {
        'epochs': args.n_epochs,
        'use_multiprocessing': args.n_procs > 1,
        'workers': args.n_procs,
        'verbose': 2,
    }
    norm_raw = bool(args.norm_raw)

    if args.n_folds < 0:
        # No cross validation: pool every element of x and y for training
        logging.info('NOTE: Training on all pooled data!')
        pooled_x = [sample for group in x for sample in group]
        pooled_y = np.concatenate(y)

        logging.info('Constructing model...')
        dataGen = Models.Generator(pooled_x, pooled_y, args.max_len,
                                   batch_size=64, norm_raw=norm_raw)
        deepmased = Models.deepmased(config)
        tensorboard_cb = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs_final'),
            histogram_freq=0,
            write_graph=True,
            write_images=True)

        logging.info('Training network...')
        deepmased.net.fit_generator(
            generator=dataGen,
            callbacks=[tensorboard_cb, deepmased.reduce_lr],
            **fit_opts)

        logging.info('Saving trained model...')
        name_parts = [args.save_name, args.technology]
        model_file = os.path.join(save_path, '_'.join(name_parts + ['model.h5']))
        deepmased.save(model_file)
        logging.info('  File written: {}'.format(model_file))
        # The generator's mean/std are stored alongside the model
        mstd_file = os.path.join(save_path, '_'.join(name_parts + ['mean_std.pkl']))
        with open(mstd_file, 'wb') as handle:
            pickle.dump([dataGen.mean, dataGen.std], handle)
        logging.info('  File written: {}'.format(mstd_file))
        return

    # kfold cross validation
    logging.info('Running kfold cross validation. n-folds: {}'.format(args.n_folds))
    # The last fold's model file is used as the marker for a finished run
    last_fold_h5 = os.path.join(save_path, str(args.n_folds - 1) + '_model.h5')
    if os.path.exists(last_fold_h5):
        if args.force_overwrite is False:
            raise IOError(
                'Output already exists ({}). Use --force-overwrite to overwrite the file'.format(
                    last_fold_h5))

    scores_file = os.path.join(save_path, 'scores.pkl')
    ap_scores = []
    for val_idx in range(args.n_folds):
        logging.info('Fold {}: Constructing model...'.format(val_idx))
        x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
        deepmased = Models.deepmased(config)

        # Training generator
        dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                   batch_size=64, norm_raw=norm_raw)

        # Validation generator, fed the mean/std of the training generator
        dataGen_val = Models.Generator(x_val, y_val, args.max_len,
                                       batch_size=64, shuffle=False,
                                       norm_raw=norm_raw,
                                       mean_tr=dataGen.mean,
                                       std_tr=dataGen.std)

        # Train model
        tensorboard_cb = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs'),
            histogram_freq=0,
            write_graph=True,
            write_images=True)
        logging.info('Fold {}: Training network...'.format(val_idx))
        ## binary classification (extensive misassembly)
        n_neg = len(np.where(y_tr == 0)[0])
        n_pos = len(np.where(y_tr == 1)[0])
        if n_pos:
            w_one = int(n_neg / n_pos)
        else:
            logging.warning('  No misassemblies present!')
            w_one = 0
        # Computed here but not handed to fit_generator below
        class_weight = {0: 1, 1: w_one}
        deepmased.net.fit_generator(
            generator=dataGen,
            validation_data=dataGen_val,
            callbacks=[tensorboard_cb, deepmased.reduce_lr],
            **fit_opts)

        # Scores (logged as AUC; the value stored is average precision)
        logging.info('Fold {}: Computing AUC scores...'.format(val_idx))
        scores_val = deepmased.predict_generator(dataGen_val)
        # Labels are truncated to the number of predictions returned
        fold_ap = average_precision_score(y_val[:scores_val.size], scores_val)
        ap_scores.append(fold_ap)

        # Saving data
        fold_h5 = os.path.join(save_path, str(val_idx) + '_model.h5')
        deepmased.save(fold_h5)
        logging.info('Fold {}: File written: {}'.format(val_idx, fold_h5))
        # Scores so far are re-written after every fold
        with open(scores_file, 'wb') as handle:
            pickle.dump(ap_scores, handle)
        logging.info('Fold {}: File written: {}'.format(val_idx, scores_file))