Example no. 1
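# Imports assumed by this snippet; ARD_NMF, run_method_engine, and write_output
# come from the surrounding project and are not defined here.
import argparse
import sys

import feather
import pandas as pd
import torch
import torch.multiprocessing as mp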
def run_parameter_sweep(parameters,data,args,Beta):
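    """Run the sweep in parallel: one parameter configuration per available GPU,
    with results collected from worker processes over one-way pipes."""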
    num_processes = torch.cuda.device_count()
    idx = 0
    objectives = []
    bdivs = []
    val_objectives = []
    val_bdivs = []
    nsigs = []
    times = []
    while idx <= len(parameters)-num_processes:
        print(idx)
        pipe_list = []
        processes = []
        for rank in range(num_processes):
            recv_end, send_end = mp.Pipe(False)
            p = mp.Process(target=run_method_engine, args=(data, parameters.iloc[idx+rank]['a'], parameters.iloc[idx+rank]['phi'], parameters.iloc[idx+rank]['b'], Beta,
                                                   args.prior_on_W, args.prior_on_H, parameters.iloc[idx+rank]['K0'], args.tolerance, args.max_iter, args.use_val_set, send_end, rank,))
            pipe_list.append(recv_end)
            processes.append(p)
            p.start()

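        # Drain every pipe before joining: a child blocks on send() once the pipe
        # buffer fills, so receiving first avoids a join() deadlock. Each result
        # tuple is (W, H, mask, cost, bdiv, val_cost, val_bdiv, time).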
        result_list = [x.recv() for x in pipe_list]
        for p in processes:
            p.join()
        nsig = [write_output(x[0],x[1],x[2],data.channel_names,data.sample_names,args.output_dir,
                      parameters['label'][idx+i]) for i,x in enumerate(result_list)]
        nsigs.extend(nsig)
        objectives.extend(res[3] for res in result_list)
        bdivs.extend(res[4] for res in result_list)
        val_objectives.extend(res[5] for res in result_list)
        val_bdivs.extend(res[6] for res in result_list)
        times.extend(res[7] for res in result_list)
        idx += num_processes

    if idx < len(parameters):
        # Run each remaining configuration sequentially.
        for idx in range(idx, len(parameters)):
            print(idx)
            W,H,mask,cost,bdiv,val_cost,val_bdiv,time = run_method_engine(data, parameters.iloc[idx]['a'], parameters.iloc[idx]['phi'], parameters.iloc[idx]['b'], Beta,
                                                   args.prior_on_W, args.prior_on_H, parameters.iloc[idx]['K0'], args.tolerance, args.max_iter, args.use_val_set)
            nsig = write_output(W,H,mask,data.channel_names,data.sample_names,args.output_dir,
                      parameters['label'][idx])
            times.append(time)
            nsigs.append(nsig)
            objectives.append(cost)
            val_objectives.append(val_cost)
            bdivs.append(bdiv)
            val_bdivs.append(val_bdiv)
    parameters['nsigs'] = nsigs
    parameters['objective_trainset'] = objectives
    parameters['bdiv_trainset'] = bdivs
    parameters['objective_valset'] = val_objectives
    parameters['bdiv_valset'] = val_bdivs
    parameters['times'] = times
    parameters.to_csv(args.output_dir + '/parameters_with_results.txt',sep='\t',index=False)

Example no. 2

def run_parameter_sweep(parameters,dataset,args,Beta):
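    """Run the sweep serially: one parameter configuration at a time on a single device."""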
    objectives = []
    nsigs = []
    times = []

    for idx in range(len(parameters)):
        data = ARD_NMF(dataset,args.objective)
        # Pull the per-row hyperparameters from the sweep table.
        W,H,cost,time = run_method_engine(data, parameters.iloc[idx]['a'], parameters.iloc[idx]['phi'],
                                          parameters.iloc[idx]['b'], Beta, args.prior_on_W, args.prior_on_H,
                                          args.K0, args.tolerance, args.max_iter)
        nsig = write_output(W,H,data.channel_names,data.sample_names,args.output_dir,
                    args.output_prefix + "_" + parameters['label'][idx])
        times.append(time)
        nsigs.append(nsig)
        objectives.append(cost)
    parameters['nsigs'] = nsigs
    parameters['objective'] = objectives
    parameters['times'] = times
    parameters.to_csv(args.output_dir + '/' + args.output_prefix + '_results.txt',sep='\t',index=False)
def main():
    ''' Run ARD NMF'''
    torch.multiprocessing.set_start_method('spawn')
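    # 'spawn' is required to use CUDA in worker processes; a forked child cannot
    # safely reinitialize the parent's CUDA context.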

    parser = argparse.ArgumentParser(
        description='NMF with a sparsity-inducing prior, as described in https://arxiv.org/pdf/1111.6085.pdf')
    parser.add_argument('--data', help='Data Matrix', required=True)
    parser.add_argument('--feather', help='Input in feather format', required=False, default=False, action='store_true')
    parser.add_argument('--parquet', help='Input in parquet format', required=False, default=False, action='store_true')
    parser.add_argument('--K0', help='Initial K parameter', required=False, default=None, type=int)
    parser.add_argument('--max_iter', help='maximum iterations', required=False, default=10000, type=int)
    parser.add_argument('--del_', help='Early stop condition based on lambda change', required=False, default=1,
                        type=int)
    parser.add_argument('--tolerance', help='Early stop condition based on max lambda entry', required=False, default=1e-6,
                        type=float)
    parser.add_argument('--phi', help='Dispersion parameter; see the paper for a discussion of choosing phi. '
                                      'Default = 1.', required=False, default=1.0, type=float)
    parser.add_argument('--a', help='Hyperparameter for lambda. We recommend trying various values of a: smaller values '
                                    'will result in sparser solutions. A good starting point might be '
                                    'a = log(F+N).', required=False, default=10.0, type=float)

    parser.add_argument('--b', help='Hyperparameter for lambda. Default is as recommended in Tan and Fevotte (2012).',
                        required=False, type=float, default=None)
    parser.add_argument('--objective', help='Defines the data objective. Choose between "poisson" and "gaussian". Defaults to "poisson".',
                        required=False, default='poisson', type=str)

    parser.add_argument('--prior_on_W', help='Prior on W matrix: "L1" (exponential) or "L2" (half-normal)',
                        required=False, default='L1', type=str)
    parser.add_argument('--prior_on_H', help='Prior on H matrix: "L1" (exponential) or "L2" (half-normal)',
                        required=False, default='L1', type=str)

    parser.add_argument('--output_dir', help='Output directory; in parameter-sweep mode, each run writes its results here', required=True)
    parser.add_argument('--output_prefix', help='Prefix for output files', required=False, default="result", type=str)
    parser.add_argument('--labeled', help='Input has row and column labels', required=False,default=False, action='store_true')
    parser.add_argument('--report_frequency', help='Number of iterations between progress reports', required=False,
                        default=100, type=int)
    parser.add_argument('--dtype', help='Floating point accuracy', required=False,
                        default='Float32', type=str)
    parser.add_argument('--parameters_file', help='Allows running many different configurations of the NMF method on a '
                                                  'multi-GPU system. To run in this mode, provide a tab-separated text file '
                                                  'with the headers (a,phi,b,prior_on_W,prior_on_H,Beta,label); label '
                                                  'indicates the output stem of the results from each run.',
                        required=False, default=None)
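    # Hypothetical parameters_file contents (tab-separated), following the headers above:
    #   a     phi   b     prior_on_W  prior_on_H  Beta  label
    #   10.0  1.0   5.0   L1          L1          1     run_a10
    #   25.0  1.0   5.0   L1          L2          1     run_a25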
    args = parser.parse_args()


    print('Reading data frame from ' + args.data)

    if args.dtype == 'Float32':
        args.dtype = torch.float32
    elif args.dtype == 'Float16':
        args.dtype = torch.float16
    else:
        print('dtype parameter should be one of "Float32" or "Float16"')
        sys.exit()
    
    if args.parquet:
        dataset = pd.read_parquet(args.data)
    elif args.feather:
        print('loading feather...')
        dataset = feather.read_dataframe(args.data)
    else:
        if args.labeled:
            dataset = pd.read_csv(args.data, sep='\t', header=0, index_col=0)
        else:
            dataset = pd.read_csv(args.data, sep='\t', header=None)


    if args.objective.lower() == 'poisson':
        Beta = 1
    elif args.objective.lower() == 'gaussian':
        Beta = 2
    else:
        print('objective parameter should be one of "gaussian" or "poisson"')
        sys.exit()
    if args.parameters_file is not None:
        parameters = pd.read_csv(args.parameters_file,sep='\t')
        run_parameter_sweep(parameters,dataset,args,Beta)
    else:
        data = ARD_NMF(dataset,args.objective)
        W,H,cost,time = run_method_engine(data, args.a, args.phi, args.b, Beta, 
                                                   args.prior_on_W, args.prior_on_H, args.K0, args.tolerance,args.max_iter)
        nsig = write_output(W,H,data.channel_names,data.sample_names,args.output_dir,args.output_prefix)
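
# Entry point so the example runs as a script.
if __name__ == "__main__":
    main()

# Hypothetical invocation (script name assumed, not given in the listing):
#   python run_ardnmf.py --data counts.tsv --labeled --objective poisson \
#       --output_dir results --output_prefix run1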