Ejemplo n.º 1
0
def main(args):    
    np.random.seed(12)
    
    # Build model
    config = Config(args)

    deepmased = Models.deepmased(config)
    deepmased.print_summary()

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    save_path = args.save_path

    # Load and process data
    logging.info('Loading data...')
    x, y = Utils.load_features_tr(args.data_path,
                                  max_len=args.max_len,
                                  standard=args.standard,
                                  mode = config.mode, 
                                  pickle_only=args.pickle_only)

    if args.n_folds == -1:
        # Append elements in x
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)


    if args.n_folds > -1:
        if os.path.exists(os.path.join(save_path, str(args.n_folds - 1) + '_model.h5')):
            exit()

        ap_scores = []
        for val_idx in range(args.n_folds):
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
            deepmased = Models.deepmased(config)

            #Construct generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                       batch_size=64, norm_raw=bool(args.norm_raw))

            # Init validation generator and 
            dataGen_val = Models.Generator(x_val, y_val, args.max_len, batch_size=64, 
                                           shuffle=False, norm_raw=bool(args.norm_raw), 
                                           mean_tr=dataGen.mean, std_tr=dataGen.std)


            #Train model
            tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs'), 
                                                  histogram_freq=0, 
                                                  write_graph=True, write_images=True)
            logging.info('Training network...')
            if config.mode in ['chimera', 'extensive']:
                w_one = int(len(np.where(y_tr == 0)[0])  / len(np.where(y_tr == 1)[0]))
                class_weight = {0 : 1 , 1: w_one}
                deepmased.net.fit_generator(generator=dataGen, 
                                            validation_data=dataGen_val,
                                            epochs=args.n_epochs, 
                                            use_multiprocessing=True,
                                            verbose=2,
                                            callbacks=[tb_logs, deepmased.reduce_lr])
            elif config.mode == 'edit':
                st = StandardScaler()
                y_tr = st.fit_transform(y_tr)
                y_te = st.transform(y_te)
                deepmased.net.fit(x_tr, y_tr, validation_data=(x_te, y_te),
                                  epochs=args.n_epochs, 
                                  callbacks=[tb_logs, deepmased.reduce_lr])
            logging.info('Computing AUC scores...')
            scores_val = deepmased.predict_generator(dataGen_val)

            ap_scores.append(average_precision_score(y_val[0 : scores_val.size], scores_val))

            deepmased.save(os.path.join(save_path, str(val_idx) + '_model.h5'))

            with open(os.path.join(save_path, 'scores.pkl'), 'wb') as f:
                pickle.dump(ap_scores, f)

    else:
        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs_final'), 
                                              histogram_freq=0, 
                                              write_graph=True, write_images=True)
        logging.info('Training network...')
        if config.mode in ['chimera', 'extensive']:
            w_one = int(len(np.where(y == 0)[0])  / len(np.where(y == 1)[0]))
            class_weight = {0 : 1 , 1: w_one}
            deepmased.net.fit_generator(generator=dataGen, 
                                        epochs=args.n_epochs, 
                                        use_multiprocessing=True,
                                        verbose=2,
                                        callbacks=[tb_logs, deepmased.reduce_lr])

        logging.info('Saving trained model...')
        outfile = os.path.join(save_path, 'final_model.h5')
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))

        outfile = os.path.join(save_path, 'mean_std_final_model.pkl')
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
Ejemplo n.º 2
0
def main(args):
    # init
    np.random.seed(args.seed)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    save_path = args.save_path
    
    # Build model
    config = Config(args)
    if not args.pickle_only:
        logging.info('Building model')
        deepmased = Models.deepmased(config)
        deepmased.print_summary()

    # Load and process data
    x, y = Utils.load_features_tr(args.feature_file_table,
                                  max_len=args.max_len,
                                  technology = args.technology,
                                  pickle_only = args.pickle_only,
                                  force_overwrite = args.force_overwrite,
                                  n_procs = args.n_procs)

    # kfold cross validation
    if args.n_folds >= 0:
        logging.info('Running kfold cross validation. n-folds: {}'.format(args.n_folds))
        outfile_h5 = os.path.join(save_path, str(args.n_folds - 1) + '_model.h5')
        if os.path.exists(outfile_h5) and args.force_overwrite is False:
            msg = 'Output already exists ({}). Use --force-overwrite to overwrite the file'
            raise IOError(msg.format(outfile_h5))

        # iter over folds
        ap_scores = []
        for val_idx in range(args.n_folds):
            logging.info('Fold {}: Constructing model...'.format(val_idx))        
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
            deepmased = Models.deepmased(config)

            #Construct generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len,
                                       batch_size=64, norm_raw=bool(args.norm_raw))

            # Init validation generator and 
            dataGen_val = Models.Generator(x_val, y_val, args.max_len, batch_size=64, 
                                           shuffle=False, norm_raw=bool(args.norm_raw), 
                                           mean_tr=dataGen.mean, std_tr=dataGen.std)

            #Train model
            tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs'), 
                                                  histogram_freq=0, 
                                                  write_graph=True, write_images=True)
            logging.info('Fold {}: Training network...'.format(val_idx))
            ## binary classification (extensive misassembly)
            try:
                w_one = int(len(np.where(y_tr == 0)[0])  / len(np.where(y_tr == 1)[0]))
            except ZeroDivisionError:
                logging.warning('  No misassemblies present!')
                w_one = 0
            class_weight = {0 : 1 , 1: w_one}
            deepmased.net.fit_generator(generator=dataGen, 
                                        validation_data=dataGen_val,
                                        epochs=args.n_epochs, 
                                        use_multiprocessing=args.n_procs > 1,
                                        workers=args.n_procs,
                                        verbose=2,
                                        callbacks=[tb_logs, deepmased.reduce_lr])
            # AUC scores
            logging.info('Fold {}: Computing AUC scores...'.format(val_idx))
            scores_val = deepmased.predict_generator(dataGen_val)
            ap_scores.append(average_precision_score(y_val[0 : scores_val.size], scores_val))

            # Saving data
            outfile_h5_fold = os.path.join(save_path, str(val_idx) + '_model.h5')
            deepmased.save(outfile_h5_fold)
            logging.info('Fold {}: File written: {}'.format(val_idx, outfile_h5_fold))
            outfile_pkl_fold = os.path.join(save_path, 'scores.pkl')
            with open(outfile_pkl_fold, 'wb') as f:
                pickle.dump(ap_scores, f)
            logging.info('Fold {}: File written: {}'.format(val_idx, outfile_pkl_fold))

    else:
        # Skip kfold and simply pool all the data for training
        ## all elements in x and y are combined
        logging.info('NOTE: Training on all pooled data!')
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)
        
#         #downsample to half
#         import random
#         dwnsample = np.array(random.sample(range(len(y)), int(len(y)/2)))
#         x = np.array(x)[dwnsample]
#         y = np.array(y)[dwnsample]

        logging.info('Constructing model...')
        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(log_dir=os.path.join(save_path, 'logs_final'), 
                                              histogram_freq=0, 
                                              write_graph=True, write_images=True)
        
        logging.info('Training network...')
        deepmased.net.fit_generator(generator=dataGen,
                                    epochs=args.n_epochs, 
                                    use_multiprocessing=args.n_procs > 1,
                                    workers=args.n_procs,
                                    verbose=2,
                                    callbacks=[tb_logs, deepmased.reduce_lr])
            
        logging.info('Saving trained model...')
        x = [args.save_name, args.technology, 'model.h5']
        outfile = os.path.join(save_path, '_'.join(x))
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))        
        x = [args.save_name, args.technology, 'mean_std.pkl']
        outfile = os.path.join(save_path, '_'.join(x))
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))