def test(net, P, T, vP, vT, filename, epochs, mutation_rate = 0.05, population_size = 50):
    logger.info("Running genetic test for: " + filename + ' ' + str(epochs))
    print("\nTraining set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    print("\nValidation set:")
    if vP is not None and len(vP) > 0:
        print("Number of patients with events: " + str(vT[:, 1].sum()))
        print("Number of censored patients: " + str((1 - vT[:, 1]).sum()))
    else:
        print("Empty")


    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index before training = " + str(c_index))

    try:
        net = train_evolutionary(net, (P, T), (vP, vT), epochs, error_function = c_index_error, population_size = population_size, mutation_chance = mutation_rate)
    except FloatingPointError:
        print('Floating point error encountered during training, aborting...')

    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index after training = " + str(c_index))

    if vP is not None and len(vP) > 0:
        outputs = net.sim(vP)
        c_index = get_C_index(vT, outputs)
        logger.info("C index vald = " + str(c_index))

    return net

    def testGeneticCindexError(self):
        print("\nC Error")
        T = self.generateRandomTestData(1000)
        outputs = self.generateRandomTestData(1000)
        c_index = get_C_index(T, outputs)
        rand_error = c_index_error(T, outputs) / len(T)
        test_error = 1 / c_index
        print("rand_error = ", rand_error, "test value = ", test_error, "c_index = ", c_index)
        assert(abs(rand_error - test_error) < 0.0001)

        T[:, 0] = np.arange(len(T))
        outputs = T
        rev_outputs = outputs[::-1]

        c_index = get_C_index(T, outputs)
        ord_error = c_index_error(T, outputs) / len(T)
        test_error = 1 / c_index
        print("ordered_error = ", ord_error, "test value = ", test_error, "c_index = ", c_index)
        assert(ord_error == test_error)

        c_index = get_C_index(T, rev_outputs)
        rev_error = c_index_error(T, rev_outputs) / len(T)
        #1 / c_index would divide by zero here (a fully reversed ranking gives
        #c_index == 0), so the error function is expected to return its cap of 9000
        test_error = 9000.0
        print("reversed_error = ", rev_error, "test value = ", test_error, "c_index = ", c_index)
        assert(rev_error == test_error)

        assert(ord_error < rev_error)

        T[:, 0] = np.arange(len(T))
        T[0, 1], T[-1, 1] = 1, 1 #Make sure they are non-censored
        outputs = T.copy()
        #Swap first and last rows with fancy indexing; a plain tuple swap of
        #numpy row views would overwrite the first row before it is read
        outputs[[0, -1]] = outputs[[-1, 0]]
        rev_outputs = outputs[::-1]

        c_index = get_C_index(T, outputs)
        ord_error = c_index_error(T, outputs) / len(T)
        test_error = 1 / c_index
        print("1_off_error = ", ord_error, "test value = ", test_error, "c_index = ", c_index)
        assert(ord_error == test_error)

        assert(ord_error > 1)

        c_index = get_C_index(T, rev_outputs)
        rev_error = c_index_error(T, rev_outputs) / len(T)
        test_error = 1 / c_index
        print("1_off_reversed_error = ", rev_error, "test value = ", test_error, "c_index = ", c_index)
        assert(rev_error == test_error)

        assert(rev_error > 1)
def c_index_error(target, result):
    '''Used in genetic training.
    The error is scaled by the length of the target array because the genetic
    algorithm divides the error by that same length internally.'''
    #len(target) factor compensates for internals of the genetic training
    #abs(C - 0.5) would make both "positive" and "negative" C-index work,
    #since both directions of ranking carry information
    C = get_C_index(target, result)

    return __inversed__(C, len(target))
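
#A minimal sketch (an assumption, not the project's actual helper) of what
#__inversed__ presumably computes: len(target) / C, so that the genetic
#algorithm's internal division by len(target) restores the error 1 / C.
#The 9000.0 cap when C == 0 is inferred from testGeneticCindexError above.
def _inversed_sketch(C, length):
    if C == 0:
        #A fully reversed ranking gives C == 0; cap instead of dividing by zero
        return 9000.0 * length
    return length / C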
def test_model_arrays(savefile, filename, P, T, **kwargs):
    #Pickled committees must be read in binary mode
    with open(savefile, 'rb') as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    output_file = 'test_{0}_{1}.csv'.format(os.path.splitext(os.path.basename(savefile))[0], \
                                                              os.path.splitext(os.path.basename(filename))[0])
    #Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])
    if T is None or len(T) == 0:
        with open(output_file, 'w') as F:
            #print('Targets\tOutputs\tEvents:')
            F.write("Outputs\n")
            for o in outputs:
                #print("{0}\t{1}\t{2}".format(t[0], o[0], t[1]))
                F.write("{0}\n".format(o[0]))
        return outputs

    c_index = get_C_index(T, outputs)

    print("C-Index: {0}".format(c_index))

    #if len(sys.argv) > 2:
    #    thresholds = [float(t) for t in sys.argv[2:]]
    #else:
    thresholds = None

    #Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27  # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean  # height in inches
    fig_size = [fig_width, fig_height]
    #Update settings
    plt.rcParams['figure.figsize'] = fig_size

    th = kaplanmeier(time_array=T[:, 0],
                     event_array=T[:, 1],
                     output_array=outputs,
                     threshold=thresholds,
                     show_plot=False,
                     bestcut=False,
                     **kwargs)
    #print("Threshold dividing the set in two equal pieces: " + str(th))
    if plt:
        plt.savefig('kaplanmeier_{0}_{1}.eps'.format(os.path.splitext(os.path.basename(savefile))[0], \
                                             os.path.splitext(os.path.basename(filename))[0]))

    with open(output_file, 'w') as F:
        #print('Targets\tOutputs\tEvents:')
        F.write("Targets,Outputs,Events\n")
        for t, o in zip(T, outputs):
            #print("{0}\t{1}\t{2}".format(t[0], o[0], t[1]))
            F.write("{0},{1},{2}\n".format(t[0], o[0], t[1]))

    return output_file
def main(model, test_data, test_targets, column_map):
    print(column_map)
    #First establish baseline c-index
    out = np.array([[model.risk_eval(inputs)] for inputs in test_data])
    base_cindex = get_C_index(test_targets, out)
    #Now we can calculate any changes. Do so now for each variable
    #TODO: make sure they are ordered correctly
    variable_changes = {}
    for var, i in column_map.iteritems():
        print("Checking {}, {}".format(i, var))
        #Make a copy of the data set so we can modify the variable
        temp_data = test_data.copy()
        #Clamp this variable to a constant (1) so it carries no information
        temp_data[:, i] = 1
        #Generate output and calc c-index; scale the difference by 100
        #to express the change in percentage points
        out = np.array([[model.risk_eval(inputs)] for inputs in temp_data])
        variable_changes[var] = 100*(base_cindex - get_C_index(test_targets, out))

    #All variables completed. Return dictionary
    return variable_changes
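
#A small helper sketch (not part of the original code) for presenting the
#dictionary returned by main() above: sort variables by the change in C-index
#(in percentage points) caused by clamping them, largest impact first.
def print_variable_ranking(variable_changes):
    #iteritems() keeps this consistent with the Python 2 style used throughout
    ranked = sorted(variable_changes.iteritems(), key = lambda kv: kv[1], reverse = True)
    for var, change in ranked:
        print("{0}: {1:.3f}".format(var, change))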
def survival_stat(filename, thresholds = None):
    data = np.array(read_data_file(filename, ","))
    D, t = parse_data(data, inputcols = (2, 3, 4, 5, 6, 7, 8, 9, 10), ignorerows = [0], normalize = False)

    T = D[:, (2, 3)]
    outputs = D[:, (-1, 3)]
    C = get_C_index(T, outputs)

    print("C-index: " + str(C))
    print("Genetic error: " + str(1 / C))

    th = kaplanmeier(D, 2, 3, -1, threshold = thresholds)
    print("Threshold dividing the set in two equal pieces: " + str(th))
    if plt:
        plt.show()
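
#Worked example of the error/C-index relation printed above: a random ranking
#gives C = 0.5 and genetic error 1 / 0.5 = 2.0; a perfect ranking gives
#C = 1.0 and error 1.0. Lower genetic error is better.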
def experiment(net, P, T, vP, vT, filename, epochs, learning_rate):
    logger.info("Running experiment for: " + filename + ' ' + str(epochs) + ", rate: " + str(learning_rate))
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    timeslots = generate_timeslots(T)

    try:
        net = traingd(net, (P, T), (vP, vT), epochs, learning_rate, block_size = 100, error_module = cox_error)
    except FloatingPointError:
        print('Floating point error encountered during training, aborting...')
    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index = " + str(c_index))

    #plot_network_weights(net)

    kaplanmeier(time_array = T[:, 0], event_array = T[:, 1], output_array = outputs[:, 0])
    if vP is not None and len(vP) > 0:
        outputs = net.sim(vP)
        kaplanmeier(time_array = vT[:, 0], event_array = vT[:, 1], output_array = outputs[:, 0])

    return net
def com_cross():

    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    #try:
    #    columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    #except SyntaxError:
    #if len(sys.argv) < 3:
    columns = (2, -4, -3, -2, -1)
    #else:
    #    columns = [int(col) for col in sys.argv[2:]]

    print('\nIncluding columns: ' + str(columns))

    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    #print('\nRemoving tail censored...')
    #P, T = copy_without_censored(P, T)

    #Divide into validation sets
    #test_size = 0.33
    #print('Size of test set (not used in training): ' + str(test_size))
    #((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size = test_size, binary_column = 1)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    #print("Length of training set: " + str(len(TP)))
    #print("Length of test set: " + str(len(VP)))

    #try:
    #    comsize = input("Number of networks to cross-validate [10]: ")
    #except SyntaxError:
    if len(sys.argv) < 2:
        netsize = 1
    else:
        netsize = int(sys.argv[1])
    print("\nNumber of hidden nodes: " + str(netsize))
    comsize = 4
    print('Number of members in each committee: ' + str(comsize))
    comnum = 5
    print('Number of committees to cross-validate: ' + str(comnum))

    times_to_cross = 3
    print('Number of times to repeat cross-validation: ' + str(times_to_cross))

    #try:
    #    pop_size = input('Population size [50]: ')
    #except SyntaxError as e:
    pop_size = 100
    print("Population size: " + str(pop_size))

    #try:
    #    mutation_rate = input('Please input a mutation rate (0.25): ')
    #except SyntaxError as e:
    mutation_rate = 0.05
    print("Mutation rate: " + str(mutation_rate))

    #try:
    #    epochs = input("Number of generations (200): ")
    #except SyntaxError as e:
    epochs = 100
    print("Epochs: " + str(epochs))

    for _cross_time in xrange(times_to_cross):

        data_sets = get_cross_validation_sets(P, T, comnum, binary_column = 1)

        print('\nTest Errors, Validation Errors:')

        for _com_num, (TS, VS) in zip(xrange(comnum), data_sets):
            com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')

            #1 is the column in the target array which holds the binary censoring information
            test_errors, vald_errors, internal_sets = train_committee(com, train_evolutionary, TS[0], TS[1], 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)

            com.set_training_sets([_set[0][0] for _set in internal_sets]) #first 0 gives training sets, second 0 gives inputs.

            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TS[0]]) #Need double brackets for dimensions to be right for numpy
            train_c_index = get_C_index(TS[1], outputs)
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in VS[0]]) #Need double brackets for dimensions to be right for numpy
            val_c_index = get_C_index(VS[1], outputs)

            print(str(1.0 / train_c_index) + ", " + str(1.0 / val_c_index))
def train_model(filename, columns, targets, separator = '\t', comsize=1):
    '''
    train_model(design, filename, columns, targets)

    Given a design, will train a committee like that on the data specified. Will save the committee as
    '.design_time.pcom' where design is replaced by the design and time is replaced by a string of numbers from time()
    Returns this filename
    '''
    headers = []
    headers.extend(columns)
    headers.extend(targets) #Add targets to the end

    targetcol = targets[0]
    eventcol = targets[1]

    savefile = ".cox_{time:.0f}.pcom".format(time = time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = False, separator = separator, use_header = True)

    #columns = (2, -6, -5, -4, -3, -2, -1)
    #_P, T = parse_file(filename, targetcols = [4, 5], inputcols = (2, -4, -3, -2, -1), ignorerows = [0], normalize = True)
    #P, _T = parse_file(filename, targetcols = [4], inputcols = columns, ignorerows = [0], normalize = True)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    print('Number of members in the committee: ' + str(comsize))

    allpats = P.copy()
    #allpats[:, 1] = 1 #This is the event column

    allpats_targets = T

    patvals = [[] for bah in xrange(len(allpats))]

    cox_committee = None

    #Get an independent test set, 1/tau of the total.
    super_set = get_cross_validation_sets(P, T, 1, binary_column = 1)

    #For every blind test group
    for ((TRN, TEST), _t) in zip(super_set, xrange(len(super_set))):
        TRN_INPUTS = TRN[0]
        TRN_TARGETS = TRN[1]
        #TEST_INPUTS = TEST[0]
        #TEST_TARGETS = TEST[1]

        #Modulo expressions mean we can deal with any number of committees, not only multiples of three
        _res = 1 if comsize == 1 else 0
        for com_num in xrange(int(comsize / 3) + int((comsize % 3) / 2) + _res):
            #Each iteration creates 3 fresh validation sets, each 1/3 of the training data
            _tmp_val_sets = get_cross_validation_sets(TRN_INPUTS, TRN_TARGETS, 3, binary_column = 1)
            val_sets = []
            if int(comsize / 3) > 0:
                _max = 3
            else:
                _max = int((comsize % 3) / 2) * 2 + _res
            for _tmp_val_set in _tmp_val_sets[:_max]:
                ((trn_in, trn_tar), (val_in, val_tar)) = _tmp_val_set
                #Add target columns to the end
                _trn = np.append(trn_in, trn_tar, axis = 1)
                _val = np.append(val_in, val_tar, axis = 1)
                val_sets.append((_trn, _val))

            #And create 3 cox models, one for each validation
            tmp_com = committee(val_sets, targetcol, eventcol, headers)
	    print("Adding this many members: " + str(len(tmp_com)))
            if cox_committee is None:
                cox_committee = tmp_com
            else:
                #Extend the big committee
                cox_committee.members.extend(tmp_com.members)


    #Now what we'd like to do is get the value for each patient in the
    #validation set, for all validation sets. Then I'd like to average the
    #result for each such patient, over the different validation sets.
    print("Validating cox committee, this might take a little while...")
    _count = 0
    if len(cox_committee) < 3:
        allpats_targets = np.empty((0, 2)) #All patients won't be in the target set in this case
    for pat, i in zip(allpats, xrange(len(patvals))):
        if _count % 50 == 0:
            print("{0} / {1}".format(_count, len(patvals)))
        _count += 1
        #We could speed this up by only reading every third dataset, but I'm not sure if they are ordered correctly...
        for cox in cox_committee.members:
            (_trn, _val) = cox.internal_set
            trn_in = _trn[:, :-2] #Last two columns are targets
            val_in = _val[:, :-2]
            val_tar = _val[:, -2:]
            for valpat, valtar in zip(val_in, val_tar):
                if (pat == valpat).all(): #Checks each variable individually, all() does a boolean and between the results
                    patvals[i].append(cox_committee.risk_eval(pat, cox = cox)) #Just to have something to count
                    if len(cox_committee) < 3:
                        allpats_targets = np.append(allpats_targets, [valtar], axis = 0)
                    #print cox_committee.risk_eval(pat, cox = cox)
                    break #Done with this data_set

    avg_vals = []
    for patval in patvals:
        if len(patval) > 0:
            avg_vals.append([np.mean(patval)])
    avg_vals = np.array(avg_vals)
    #avg_vals = np.array([[np.mean(patval)] for patval in patvals]) #Need  double brackets for dimensions to fit C-module
    #Now we have average validation ranks. do C-index on this
    avg_val_c_index = get_C_index(allpats_targets, avg_vals)
    print('Average validation C-Index: {0}'.format(avg_val_c_index))
    print('Saving committee in {0}'.format(savefile))
    with open(savefile, 'wb') as FILE:
        pickle.dump(cox_committee, FILE)

    return savefile
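
#Hedged usage sketch tying this trainer to test_model_arrays above: train a
#cox committee, then evaluate the saved committee on the same data file. The
#file path and column names here are hypothetical placeholders.
def _train_and_test_sketch():
    filename = 'survival_data.txt'  #hypothetical path
    columns = ['age', 'tumsize']    #hypothetical input columns
    targets = ['time', 'event']     #survival time and censoring event columns
    savefile = train_model(filename, columns, targets, comsize = 3)
    P, T = parse_file(filename, targetcols = targets, inputcols = columns,
                      normalize = False, separator = '\t', use_header = True)
    return test_model_arrays(savefile, filename, P, T)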
def main(design, **train_kwargs):
    #glogger.setLoggingLevel(glogger.debug)    
    
    #FAKE
    filename = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test_noisyindata.txt"
    filename_val = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test_val_noisyindata.txt"
    #filename = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test.txt"
    #filename_val = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test_val.txt"
    columns = ('X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6',  'X7', 'X8', 'X9')
    #columns = ('X0', 'X1', 'X2', 'X3', 'X4', 'X5')
    targets = ['censnoisytime', 'event']
    #targets = ['censtime', 'event']
    #targets = ['time', 'event1']

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = '\t', use_header = True)
    Pval, Tval = parse_file(filename_val, targetcols = targets, inputcols = columns, normalize = True, separator = '\t', use_header = True)

    #--------------------------------------

    #REAL    
    #filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
    #columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize', 'log(1+er_cyt)', 'log(1+pgr_cyt)', 'pgr_cyt_pos', 
    #           'er_cyt_pos', 'size_gt_20', 'er_cyt_pos', 'pgr_cyt_pos')
    #targets = ['time_10y', 'event_10y']
    #P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = '\t', use_header = True)
    #Pval, Tval = None, None

    #--------------------------------------    
    
    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
        
    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))
        
    errorfunc = c_index_error
    
    print("\nError function: " + errorfunc.__name__)
    
    print("\nDesign: " + str(design))
    layers = []
    hidden_func = design[-1]
    for layer_size in design[:-1]:
        layers.append(layer_size)

    net = build_feedforward_multilayered(input_number = len(P[0]), hidden_numbers = layers, output_number = 1, hidden_function = hidden_func, output_function = "linear")
    #net = build_feedforward(3, len(P[0]), netsize, 1, hidden_function = hidden_func, output_function = 'linear')

    #set_specific_starting_weights(net)    
    
    best_net = train_evolutionary(net, (P, T), (Pval, Tval), binary_target = 1, error_function = c_index_error, **train_kwargs)
    
    cens_output = []
    
    results = best_net.sim(P)
    best_net.trn_set = results[:, 0] #To get rid of extra dimensions
    #Now sort the set
    best_net.trn_set = numpy.sort(best_net.trn_set)
    
    for pat in P:
        cens_output.append(risk_eval(best_net, pat))
    
    cens_output = numpy.array([[val] for val in cens_output])
    
    #Calc C-index
    c_index = get_C_index(T, cens_output)
    
    print("C-Index: {0}".format(c_index))
def scatterplot_files(targetfile, targetcol, eventcol, modelfile,
                      modeloutputcol, **kwargs):
    '''
    scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol)

    Takes two files because the target data and model data are allowed to be in different files.
    Events are ONLY taken from target data.
    Writes two files (where C is the computed C-index):
        scatter_cens_cind_C_modelfile_targetfile.eps
        scatter_nocens_C_modelfile_targetfile.eps
    '''

    #Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27  # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean  # height in inches
    fig_size = [fig_width, fig_height]
    #Update settings
    plt.rcParams['figure.figsize'] = fig_size
    #params = {'axes.labelsize': 10,
    #          'text.fontsize': 10,
    #          'legend.fontsize': 10,
    #          'xtick.labelsize': 8,
    #          'ytick.labelsize': 8,
    #'text.usetex': True,
    #          'figure.figsize': fig_size}
    #plt.rcParams.update(params)

    #    with open(targetfile, 'r') as f:
    #        X_in = [line.split() for line in f.readlines()]
    #    X_in = numpy.array(X_in)
    #    X = X_in[1:, first_col]
    #    X = numpy.array(X, dtype = 'float')

    data = np.array(read_data_file(targetfile, ","))
    T, t = parse_data(data,
                      inputcols=(targetcol, eventcol),
                      ignorerows=[0],
                      normalize=False)
    X = T[:, 0]
    events = T[:, 1]

    #    with open(modeloutputcol, 'r') as f:
    #        Y_in = [line.split() for line in f.readlines()]
    #
    #    Y_in = numpy.array(Y_in)
    #    Y = Y_in[1:, second_col]
    #    Y = numpy.array(Y, dtype = 'float')

    data = np.array(read_data_file(modelfile, ","))
    D, t = parse_data(data,
                      inputcols=[modeloutputcol],
                      ignorerows=[0],
                      normalize=False)
    Y = D[:, 0]
    #    if event_col is not None:
    #        events = X_in[1:, event_col]
    #        events = numpy.array(events, dtype = 'float')
    #        print 'Using events'
    #    else:
    #        events = None

    #    T = numpy.empty((len(X), 2), dtype='float')
    #    T[:, 0] = X
    #    T[:, 1] = events
    outputs = np.empty((len(X), 2), dtype='float')
    outputs[:, 0] = Y
    outputs[:, 1] = events
    c_index = get_C_index(T, outputs)
    print("C-Index between these files is: {0}".format(c_index))

    scatter(X,
            Y,
            events=events,
            x_label='Targets',
            y_label='Model output',
            gridsize=30,
            mincnt=0,
            show_plot=False)
    #plt.xlabel(os.path.basename(sys.argv[1]) + "\nC-Index between these files is: {0}".format(c_index))
    #plt.ylabel('Correlation of ' + os.path.basename(sys.argv[2]))

    plt.savefig('scatter_cens_cind_{cindex}_{0}_{1}.eps'.format(
        os.path.splitext(os.path.basename(modelfile))[0],
        os.path.splitext(os.path.basename(targetfile))[0],
        cindex=c_index))

    scatter(X,
            Y,
            x_label='Targets',
            y_label='Model output',
            gridsize=30,
            mincnt=0,
            show_plot=False)
    #plt.xlabel(os.path.basename(sys.argv[1]) + "\nC-Index between these files is: {0}".format(c_index))
    #plt.ylabel('Correlation of ' + os.path.basename(sys.argv[2]))

    plt.savefig('scatter_nocens_{cindex}_{0}_{1}.eps'.format(
        os.path.splitext(os.path.basename(modelfile))[0],
        os.path.splitext(os.path.basename(targetfile))[0],
        cindex=c_index))
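
#Hypothetical invocation of scatterplot_files above; the file names and column
#indices are placeholders only. The target file supplies the time (targetcol)
#and event (eventcol) columns, the model file supplies the model output column.
def _scatter_sketch():
    scatterplot_files('targets.csv', 0, 1, 'model_output.csv', 0)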
def model_contest(filename, columns, targets, designs, comsize_third = 5, repeat_times = 20, testfilename = None, separator = '\t', **train_kwargs):
    '''
    model_contest(filename, columns, targets, designs)
    
    You must use column names! Here are example values for the input arguments:
        
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
    columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize', 'log(1+er_cyt)', 'log(1+pgr_cyt)', 'pgr_cyt_pos',
               'er_cyt_pos', 'size_gt_20', 'er_cyt_pos', 'pgr_cyt_pos')
    targets = ['time', 'event']
    
    Writes the results to '.winningdesigns_time.csv' and returns the filename
    '''

    starting_time = time.time()
    fastest_done = None
    m = Master()

    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    print('\nIncluding columns: ' + str(columns))
    print('\nTarget columns: ' + str(targets))

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = separator,
                      use_header = True)

    if testfilename is not None:
        Ptest, Ttest = parse_file(testfilename, targetcols = targets, inputcols = columns, normalize = True, separator = separator,
                      use_header = True)
    else:
        Ptest, Ttest = None, None

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    print("T:" + str(T.shape))
    print("P:" + str(P.shape))
    if (Ptest is not None and Ttest is not None):
        print("\nExternal Test Data set:")
        print("Number of patients with events: " + str(Ttest[:, 1].sum()))
        print("Number of censored patients: " + str((1 - Ttest[:, 1]).sum()))
        print("Ttest:" + str(Ttest.shape))
        print("Ptest:" + str(Ptest.shape))

    comsize = 3 * comsize_third #Make sure it is divisible by three
    print('\nNumber of members in each committee: ' + str(comsize))

    print('Designs used in testing (size, function): ' + str(designs))

    # We can generate a test set from the data set, but usually we don't want that
    # Leave at 1 for no test set.
    val_pieces = 1
    print('Cross-test pieces: ' + str(val_pieces))

    cross_times = repeat_times
    print('Number of times to repeat procedure: ' + str(cross_times))

    #try:
    #    pop_size = input('Population size [50]: ')
    #except SyntaxError as e:
    if 'population_size' not in train_kwargs:
        train_kwargs['population_size'] = 50

    #try:
    #    mutation_rate = input('Please input a mutation rate (0.25): ')
    #except SyntaxError as e:
    if 'mutation_chance' not in train_kwargs:
        train_kwargs['mutation_chance'] = 0.25

    #try:
    #    epochs = input("Number of generations (200): ")
    #except SyntaxError as e:
    if 'epochs' not in train_kwargs:
        train_kwargs['epochs'] = 100

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    print('\n Job status:\n')

    count = 0
    all_counts = []
    all_jobs = {}

    tests = {}
    #trn_set = {}
    trn_idx = {}
    all_best = []
    all_best_com_val = []
    all_best_avg_trn = []
    all_best_avg_val = []
    all_best_design = []
    all_best_test = []

    #Lambda times
    for _time in xrange(cross_times):
        #Get an independent test set, 1/tau of the total.
        super_set, super_indices = get_cross_validation_sets(P, T, val_pieces , binary_column = 1, return_indices = True)
        super_zip = zip(super_set, super_indices)

        all_best.append({})
        all_best_com_val.append({})
        all_best_avg_trn.append({})
        all_best_avg_val.append({})
        all_best_design.append({})
        all_best_test.append({})

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]


        #For every blind test group
        for (((TRN, TEST), (TRN_IDX, TEST_IDX)), _t) in zip(super_zip, xrange(len(super_set))):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]
            TEST_INPUTS = TEST[0]
            TEST_TARGETS = TEST[1]

            #run each architecture design on a separate machine
            best[_t] = None
            best_com_val[_t] = 0
            best_avg_trn[_t] = 0
            best_avg_val[_t] = 0
            best_design[_t] = None
            best_test[_t] = None

            for design in designs:
                count += 1
                all_counts.append(count)

                (netsize, hidden_func) = design

                com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, hidden_function = hidden_func,
                                                  output_function = 'linear')

                tests[count] = (TEST_INPUTS, TEST_TARGETS)
                #trn_set[count] = (TRN_INPUTS, TRN_TARGETS)
                #print("TRN_IDX" + str(TRN_IDX))
                #print("TEST_IDX" + str(TEST_IDX))
                trn_idx[count] = TRN_IDX

                #1 is the column in the target array which holds the binary censoring information

                job = m.assemblejob((count, _time, _t, design),
                        train_committee, com, train_evolutionary, TRN_INPUTS,
                        TRN_TARGETS, binary_target = 1, error_function = c_index_error, **train_kwargs)

                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    while(count > 0):
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult() #Blocks
            fastest_done = time.time() - starting_time
        else:
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                print('Timed out after {0} seconds. Putting remaining jobs {1} back on the queue.\n'
                      'You should restart the server after this session.'.format(fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue #Jump to next iteration

        print('Result received! Processing...')
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1

        TEST_INPUTS, TEST_TARGETS = tests[_c]
        #TRN_INPUTS, TRN_TARGETS = trn_set[_c]
        TRN_IDX = trn_idx[_c]

        all_counts.remove(_c)

        com.set_training_sets([_set[0][0] for _set in internal_sets]) #first 0 gives training sets, second 0 gives inputs.

        #Now what we'd like to do is get the value for each patient in the
        #validation set, for all validation sets. Then I'd like to average the
        #result for each such patient, over the different validation sets.

        allpats = []
        allpats.extend(internal_sets[0][0][0]) #Extend with training inputs
        allpats.extend(internal_sets[0][1][0]) #Extend with validation inputs

        allpats_targets = []
        allpats_targets.extend(internal_sets[0][0][1]) #training targets
        allpats_targets.extend(internal_sets[0][1][1]) #validation targets
        allpats_targets = numpy.array(allpats_targets)

        patvals = [[] for bah in xrange(len(allpats))]

        #print(len(patvals))
        #print(len(internal_sets_indices))
        #1 for the validation set. Was given to the com.nets in the same type of iteration, so order is same
        # Will be order consistent with P and T
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            _C_ = -1
            for valpat in val_in:
                _C_ += 1
                i = TRN_IDX[idx[1][_C_]]
                pat = P[i]
                #print("Facit: \n" + str(valpat))
                #print("_C_ = " + str(_C_))
                #print("i: " + str(i))
                #print("P[TRN_IDX[i]] : " + str(pat))
                assert((pat == valpat).all())
                patvals[i].append(com.risk_eval(pat, net = net))

        #Need  double brackets for dimensions to fit C-module
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        #Now we have average validation ranks. do C-index on this
        avg_val_c_index = get_C_index(T, avg_vals)

        trn_errors = numpy.array(trn_errors.values(), dtype = numpy.float64) ** -1
        vald_errors = numpy.array(vald_errors.values(), dtype = numpy.float64) ** -1
        avg_trn = numpy.mean(trn_errors)
        avg_val = numpy.mean(vald_errors)

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]

        if avg_val_c_index > best_com_val[_t]:
            best[_t] = com
            best_com_val[_t] = avg_val_c_index
            best_avg_trn[_t] = avg_trn
            best_avg_val[_t] = avg_val
            best_design[_t] = design
            best_test[_t] = tests[_c]


    print('\nWinning designs')
    winnerfilename = '.winningdesigns_{0:.0f}.csv'.format(time.time())
    with open(winnerfilename, 'w') as F:
        print('Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design:')
        F.write('Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design\n')
        for _time in xrange(len(all_best)):
            best = all_best[_time]
            best_com_val = all_best_com_val[_time]
            best_avg_trn = all_best_avg_trn[_time]
            best_avg_val = all_best_avg_val[_time]
            best_design = all_best_design[_time]
            best_test = all_best_test[_time]
            for _t in best.keys():
                TEST_INPUTS, TEST_TARGETS = best_test[_t]
                com = best[_t]

                if len(TEST_INPUTS) > 0:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TEST_INPUTS])
                    test_c_index = get_C_index(TEST_TARGETS, outputs)
                elif Ptest is not None and Ttest is not None:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in Ptest])
                    test_c_index = get_C_index(Ttest, outputs)
                else:
                    test_c_index = 0

                print('{trn}, {val}, {com_val}, {test}, {dsn}'.format(trn = best_avg_trn[_t], val = best_avg_val[_t],
                      com_val = best_com_val[_t], test = test_c_index, dsn = best_design[_t]))
                F.write('{trn}, {val}, {com_val}, {test}, {dsn}\n'.format(trn = best_avg_trn[_t], val = best_avg_val[_t],
                        com_val = best_com_val[_t], test = test_c_index, dsn = best_design[_t]))

    return winnerfilename
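
#Hedged usage sketch for model_contest, assembled from its own docstring. Each
#design pairs a hidden-layer size with an activation-function name, matching
#the (netsize, hidden_func) unpacking inside the loop; the 'tanh' name is an
#assumption, not confirmed by the original code.
def _contest_sketch():
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
    columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize')
    targets = ['time', 'event']
    designs = [(4, 'tanh'), (8, 'tanh')]
    return model_contest(filename, columns, targets, designs,
                         comsize_third = 5, repeat_times = 20)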
def committee_test():

    try:
        netsize = input('Number of hidden nodes? [1]: ')
    except SyntaxError as e:
        netsize = 1

    try:
        comsize = input('Committee size? [1]: ')
    except SyntaxError as e:
        comsize = 1

    try:
        pop_size = input('Population size? [100]: ')
    except SyntaxError as e:
        pop_size = 100

    try:
        mutation_rate = input('Please input a mutation rate (0.05): ')
    except SyntaxError as e:
        mutation_rate = 0.05

    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    try:
        columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    except SyntaxError:
        columns = (2, -4, -3, -2, -1)

    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)

    #remove tail censored
    try:
        cutoff = input('Cutoff for censored data? [9999 years]: ')
    except SyntaxError as e:
        cutoff = 9999
    P, T = copy_without_censored(P, T, cutoff)

    #Divide into validation sets
    try:
        test_size = float(input('Size of test set (not used in training)? Input in fractions. Default is [0.0]: '))
    except:
        test_size = 0.0
    ((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size = test_size, binary_column = 1)
    print("Length of training set: " + str(len(TP)))
    print("Length of test set: " + str(len(VP)))

    try:
        epochs = input("\nNumber of generations (1): ")
    except SyntaxError as e:
        epochs = 1

    com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')

    #1 is the column in the target array which holds the binary censoring information
    test_errors, vald_errors, data_sets = train_committee(com, train_evolutionary, P, T, 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)

    com.set_training_sets([_set[0][0] for _set in data_sets]) #first 0 gives training sets, second 0 gives inputs.

    print('\nTest C_indices, Validation C_indices:')
    for terr, verr in zip(test_errors.values(), vald_errors.values()):
        print(str(1 / terr) + ", " + str(1 / verr))

    if plt:
        outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TP]) #Need double brackets for dimensions to be right for numpy
        kaplanmeier(time_array = TT[:, 0], event_array = TT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
        train_c_index = get_C_index(TT, outputs)
        print("\nC-index on the training set: " + str(train_c_index))
        if len(VP) > 0:
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in VP]) #Need double brackets for dimensions to be right for numpy
            test_c_index = get_C_index(VT, outputs)
            kaplanmeier(time_array = VT[:, 0], event_array = VT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
            print("C-index on the test set: " + str(test_c_index))

        #raw_input("\nPress enter to show plots...")
        plt.show()

    try:
        answer = input("\nDo you wish to print committee risk output? ['n']: ")
    except (SyntaxError, NameError):
        answer = 'n'

    if answer != 'n' and answer != 'no':
        inputs = read_data_file(filename)
        P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
        outputs = [[com.risk_eval(patient)] for patient in P]
        while len(inputs) > len(outputs):
            outputs.insert(0, ["net_output"])

        print("\n")
        for rawline in zip(inputs, outputs):
            line = ''
            for col in rawline[0]:
                line += str(col)
                line += ','
            for col in rawline[1]:
                line += str(col)

            print(line)
def train_model(design, filename, columns, targets, comsize_third = 20, separator = '\t', **train_kwargs):
    '''
    train_model(design, filename, columns, targets)

    Given a design, will train a committee like that on the data specified. Will save the committee as
    '.design_time.pcom' where design is replaced by the design and time is replaced by a string of numbers from time()
    Returns this filename
    '''
    starting_time = time.time()
    fastest_done = None
    m = Master()

    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    savefile = ".{nodes}_{a_func}_{time:.0f}.pcom".format(nodes = design[0], a_func = design[1], time = time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = separator, use_header = True)

    #columns = (2, -6, -5, -4, -3, -2, -1)
    #_P, T = parse_file(filename, targetcols = [4, 5], inputcols = (2, -4, -3, -2, -1), ignorerows = [0], normalize = True)
    #P, _T = parse_file(filename, targetcols = [4], inputcols = columns, ignorerows = [0], normalize = True)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    comsize = 3 * comsize_third #Make sure it is divisible by three (3*X will create X jobs)
    print('Number of members in the committee: ' + str(comsize))

    print('Design used (size, function): ' + str(design))

    #try:
    #    pop_size = input('Population size [50]: ')
    #except SyntaxError as e:
    if 'population_size' not in train_kwargs:
        train_kwargs['population_size'] = 200
    #print("Population size: " + str(train_kwargs['population_size']))

    #try:
    #    mutation_rate = input('Please input a mutation rate (0.25): ')
    #except SyntaxError as e:
    if 'mutation_chance' not in train_kwargs:
        train_kwargs['mutation_chance'] = 0.25
    #print("Mutation rate: " + str(train_kwargs['mutation_chance']))

    #try:
    #    epochs = input("Number of generations (200): ")
    #except SyntaxError as e:
    if 'epochs' not in train_kwargs:
        train_kwargs['epochs'] = 100

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    #errorfunc = weighted_c_index_error
    errorfunc = c_index_error

    print("\nError function: " + errorfunc.__name__)

    print('\n Job status:\n')

    count = 0
    all_counts = []
    all_jobs = {}
    #trn_set = {}
    trn_idx = {}

    master_com = None

    allpats = P.copy()
    #allpats[:, 1] = 1 #This is the event column

    allpats_targets = T

    patvals = [[] for bah in xrange(len(allpats))]

    #Lambda times
    for _time in xrange(1):
        #Get an independent test set, 1/tau of the total.
        super_set, super_indices = get_cross_validation_sets(P, T, 1, binary_column = 1, return_indices = True)
        super_zip = zip(super_set, super_indices)
        #For every blind test group
        for (((TRN, TEST), (TRN_IDX, TEST_IDX)), _t) in zip(super_zip, xrange(len(super_set))):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]
            #TEST_INPUTS = TEST[0]
            #TEST_TARGETS = TEST[1]

            for com_num in xrange(comsize / 3):

                count += 1
                all_counts.append(count)

                #trn_set[count] = (TRN_INPUTS, TRN_TARGETS)
                trn_idx[count] = TRN_IDX

                (netsize, hidden_func) = design

                com = build_feedforward_committee(3, len(P[0]), netsize, 1, hidden_function = hidden_func, output_function = 'linear')

                #1 is the column in the target array which holds the binary censoring information

                job = m.assemblejob((count, _time, _t, design),
                    train_committee, com, train_evolutionary, TRN_INPUTS,
                    TRN_TARGETS, binary_target = 1, error_function = errorfunc,
                    **train_kwargs)

                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    #Time to receive the results. Block until the first result arrives, then
    #give each remaining job at most twice that turnaround time before putting
    #it back on the queue.
    while count > 0:
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult() #Blocks
            fastest_done = time.time() - starting_time
        else:
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                print('Timed out after {0} seconds. Putting remaining jobs {1} back on the '
                      'queue.\nYou should restart the server after this session.'
                      .format(2 * fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue #Jump to next iteration

        print('Result received! Processing...')
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1

        #TRN_INPUTS, TRN_TARGETS = trn_set[_c]
        TRN_IDX = trn_idx[_c]

        all_counts.remove(_c)

        com.set_training_sets([_set[0][0] for _set in internal_sets]) #first 0 gives training sets, second 0 gives inputs.

        if master_com is None:
            master_com = com
        else:
            master_com.nets.extend(com.nets) #Add this batch of networks

        #Now what we'd like to do is get the value for each patient in the
        #validation set, for all validation sets. Then I'd like to average the
        #result for each such patient, over the different validation sets.



        #1 for the validation set. Was given to the com.nets in the same type of iteration, so order is same
        # patvals will be order-consistent with P and T
        #for (_trn_set_indices, val_set_indices), net in zip(internal_sets_indices, com.nets):
        #    for i in val_set_indices:
        #        patvals_new[TRN_IDX[i]].append(com.risk_eval(P[TRN_IDX[i]], net = net))

        #idx[1] holds each network's local validation indices; TRN_IDX maps them
        #back to global row numbers in P
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            for _C_, valpat in enumerate(val_in):
                i = TRN_IDX[idx[1][_C_]]
                pat = P[i]
                assert((pat == valpat).all()) #The global index must point back at this validation pattern
                patvals[i].append(com.risk_eval(pat, net = net))

        #for pat, i in zip(allpats, xrange(len(patvals))):
            #We could speed this up by only reading every third dataset, but I'm not sure if they are ordered correctly...
        #    for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
        #        _C_ = -1
        #        for valpat in val_in:
        #            _C_ += 1
        #            if (pat == valpat).all(): #Checks each variable individually, all() does a boolean and between the results
                        #print("Facit: \n" + str(valpat))
                        #print("Allpats-index = " + str(i))
                        #print("_C_ = " + str(_C_))
                        #print("idx_val[_C_]: " + str(idx[1][_C_]))
                        #print("TRN_IDX[i]: " + str(TRN_IDX[idx[1][_C_]]))
                        #print("P[TRN_IDX[i]] : " + str(P[TRN_IDX[idx[1][_C_]]]))
        #                patvals[i].append(com.risk_eval(pat, net = net)) #Just to have something to count
        #                break #Done with this data_set

        #Need double brackets for dimensions to fit the C-module
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        #Now we have average validation outputs per patient; compute the C-index on these
        avg_val_c_index = get_C_index(allpats_targets, avg_vals)
        print('Average com-validation C-Index so far      : {0}'.format(avg_val_c_index))
        print('Saving committee so far in {0}'.format(savefile))
        with open(savefile, 'w') as FILE:
            pickle.dump(master_com, FILE)

    return savefile
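
#Minimal usage sketch for train_model above (file name, column indices and
#design values are hypothetical; the job server that Master() connects to must
#be up and accepting jobs):
#
#    design = (10, 'tanh') #(hidden nodes, hidden activation function)
#    savefile = train_model(design, 'mydata.txt', columns = (0, 1, 2),
#                           targets = [3, 4], comsize_third = 20,
#                           epochs = 100, population_size = 200,
#                           mutation_chance = 0.25)
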
def train_model(filename, columns, targets, separator='\t', comsize=1):
    '''
    train_model(filename, columns, targets)

    Trains a committee of Cox models on the data specified. Will save the committee as
    '.cox_time.pcom' where time is replaced by a string of numbers from time().
    Returns this filename.
    '''
    headers = []
    headers.extend(columns)
    headers.extend(targets)  #Add targets to the end

    targetcol = targets[0]
    eventcol = targets[1]

    savefile = ".cox_{time:.0f}.pcom".format(time=time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename,
                      targetcols=targets,
                      inputcols=columns,
                      normalize=False,
                      separator=separator,
                      use_header=True)

    #columns = (2, -6, -5, -4, -3, -2, -1)
    #_P, T = parse_file(filename, targetcols = [4, 5], inputcols = (2, -4, -3, -2, -1), ignorerows = [0], normalize = True)
    #P, _T = parse_file(filename, targetcols = [4], inputcols = columns, ignorerows = [0], normalize = True)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    print('Number of members in the committee: ' + str(comsize))

    allpats = P.copy()
    #allpats[:, 1] = 1 #This is the event column

    allpats_targets = T

    patvals = [[] for _ in xrange(len(allpats))]

    cox_committee = None

    #Get an independent test set, 1/tau of the total.
    super_set = get_cross_validation_sets(P, T, 1, binary_column=1)

    #For every blind test group
    for ((TRN, TEST), _t) in zip(super_set, xrange(len(super_set))):
        TRN_INPUTS = TRN[0]
        TRN_TARGETS = TRN[1]
        #TEST_INPUTS = TEST[0]
        #TEST_TARGETS = TEST[1]

        #Modulo expressions mean we can deal with any committee size, not only multiples of three
        _res = 1 if comsize == 1 else 0
        for com_num in xrange(
                int(comsize / 3) + int((comsize % 3) / 2) + _res):
            #Each pass through the loop creates three new validation sets, each holding 1/3 of the training data
            _tmp_val_sets = get_cross_validation_sets(TRN_INPUTS,
                                                      TRN_TARGETS,
                                                      3,
                                                      binary_column=1)
            val_sets = []
            if int(comsize / 3) > 0:
                _max = 3
            else:
                _max = int((comsize % 3) / 2) * 2 + _res
            for _tmp_val_set in _tmp_val_sets[:_max]:
                ((trn_in, trn_tar), (val_in, val_tar)) = _tmp_val_set
                #Add target columns to the end
                _trn = np.append(trn_in, trn_tar, axis=1)
                _val = np.append(val_in, val_tar, axis=1)
                val_sets.append((_trn, _val))

            #And create 3 cox models, one for each validation
            tmp_com = committee(val_sets, targetcol, eventcol, headers)
            print("Adding this many members: " + str(len(tmp_com)))
            if cox_committee is None:
                cox_committee = tmp_com
            else:
                #Extend the big committee
                cox_committee.members.extend(tmp_com.members)

    #Now what we'd like to do is get the value for each patient in the
    #validation set, for all validation sets. Then I'd like to average the
    #result for each such patient, over the different validation sets.
    print("Validating cox committee, this might take a little while...")
    _count = 0
    if len(cox_committee) < 3:
        #All patients won't be in the target set in this case
        allpats_targets = np.empty((0, 2))
    for pat, i in zip(allpats, xrange(len(patvals))):
        if _count % 50 == 0:
            print("{0} / {1}".format(_count, len(patvals)))
        _count += 1
        #We could speed this up by only reading every third dataset, but I'm not sure if they are ordered correctly...
        for cox in cox_committee.members:
            (_trn, _val) = cox.internal_set
            trn_in = _trn[:, :-2]  #Last two columns are targets
            val_in = _val[:, :-2]
            val_tar = _val[:, -2:]
            for valpat, valtar in zip(val_in, val_tar):
                #all() requires every input variable to match
                if (pat == valpat).all():
                    patvals[i].append(cox_committee.risk_eval(pat, cox=cox))
                    if len(cox_committee) < 3:
                        allpats_targets = np.append(allpats_targets, [valtar], axis=0)
                    #print cox_committee.risk_eval(pat, cox = cox)
                    break  #Done with this data set

    avg_vals = []
    for patval in patvals:
        if len(patval) > 0:
            avg_vals.append([np.mean(patval)])
    avg_vals = np.array(avg_vals)
    #avg_vals = np.array([[np.mean(patval)] for patval in patvals]) #Need  double brackets for dimensions to fit C-module
    #Now we have average validation ranks. do C-index on this
    avg_val_c_index = get_C_index(allpats_targets, avg_vals)
    print('Average validation C-Index: {0}'.format(avg_val_c_index))
    print('Saving committee in {0}'.format(savefile))
    with open(savefile, 'w') as FILE:
        pickle.dump(cox_committee, FILE)

    return savefile
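
#Minimal usage sketch for the Cox variant above (file name and column
#identifiers are made up; parse_file is called with use_header = True, so
#pass whatever column identifiers it accepts for the file at hand):
#
#    savefile = train_model('mydata.csv', ['age', 'size'],
#                           ['time', 'event'], separator = ',', comsize = 3)
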
def test_model_arrays(savefile, filename, P, T, **kwargs):
    with open(savefile, "r") as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    output_file = "test_{0}_{1}.csv".format(
        os.path.splitext(os.path.basename(savefile))[0], os.path.splitext(os.path.basename(filename))[0]
    )
    # Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])
    if T is None or len(T) == 0:
        with open(output_file, "w") as F:
            # print('Targets\tOutputs\tEvents:')
            F.write("Outputs\n")
            for o in outputs:
                # print("{0}\t{1}\t{2}".format(t[0], o[0], t[1]))
                F.write("{0}\n".format(o[0]))
        return outputs

    c_index = get_C_index(T, outputs)

    print("C-Index: {0}".format(c_index))

    # if len(sys.argv) > 2:
    #    thresholds = [float(t) for t in sys.argv[2:]]
    # else:
    thresholds = None

    # Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27  # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean  # height in inches
    fig_size = [fig_width, fig_height]
    # Update settings
    plt.rcParams["figure.figsize"] = fig_size

    th = kaplanmeier(
        time_array=T[:, 0],
        event_array=T[:, 1],
        output_array=outputs,
        threshold=thresholds,
        show_plot=False,
        bestcut=False,
        **kwargs
    )
    # print("Threshold dividing the set in two equal pieces: " + str(th))
    if plt:
        plt.savefig(
            "kaplanmeier_{0}_{1}.eps".format(
                os.path.splitext(os.path.basename(savefile))[0], os.path.splitext(os.path.basename(filename))[0]
            )
        )

    with open(output_file, "w") as F:
        # print('Targets\tOutputs\tEvents:')
        F.write("Targets,Outputs,Events\n")
        for t, o in zip(T, outputs):
            # print("{0}\t{1}\t{2}".format(t[0], o[0], t[1]))
            F.write("{0},{1},{2}\n".format(t[0], o[0], t[1]))

    return output_file
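
#Minimal usage sketch (assumes P and T come from parse_file with the same
#input columns the saved committee was trained on; file names are made up):
#
#    P, T = parse_file('mydata.txt', targetcols = [3, 4], inputcols = (0, 1, 2),
#                      normalize = True, separator = '\t', use_header = True)
#    output_file = test_model_arrays('.10_tanh_1234567890.pcom', 'mydata.txt', P, T)
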
def scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol, **kwargs):
    '''
    scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol)

    Takes two files because the target data and model data are allowed to be in different files.
    Events are ONLY taken from the target data.
    Writes two files (cindex, modelfile and targetfile filled in accordingly):
        scatter_cens_cind_<cindex>_<modelfile>_<targetfile>.eps
        scatter_nocens_<cindex>_<modelfile>_<targetfile>.eps
    '''

    #Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0/72.27               # Convert pt to inch
    golden_mean = (sqrt(5)-1.0)/2.0         # Aesthetic ratio
    fig_width = fig_width_pt*inches_per_pt  # width in inches
    fig_height = fig_width*golden_mean      # height in inches
    fig_size =  [fig_width,fig_height]
    #Update settings
    plt.rcParams['figure.figsize'] = fig_size
    #params = {'axes.labelsize': 10,
    #          'text.fontsize': 10,
    #          'legend.fontsize': 10,
    #          'xtick.labelsize': 8,
    #          'ytick.labelsize': 8,
              #'text.usetex': True,
    #          'figure.figsize': fig_size}
    #plt.rcParams.update(params)

#    with open(targetfile, 'r') as f:
#        X_in = [line.split() for line in f.readlines()]
#    X_in = numpy.array(X_in)
#    X = X_in[1:, first_col]
#    X = numpy.array(X, dtype = 'float')

    data = np.array(read_data_file(targetfile, ","))
    T, t = parse_data(data, inputcols = (targetcol, eventcol), ignorerows = [0], normalize = False)
    X = T[:, 0]
    events = T[:, 1]

#    with open(modeloutputcol, 'r') as f:
#        Y_in = [line.split() for line in f.readlines()]
#
#    Y_in = numpy.array(Y_in)
#    Y = Y_in[1:, second_col]
#    Y = numpy.array(Y, dtype = 'float')

    data = np.array(read_data_file(modelfile, ","))
    D, t = parse_data(data, inputcols = [modeloutputcol], ignorerows = [0], normalize = False)
    Y = D[:, 0]
#    if event_col is not None:
#        events = X_in[1:, event_col]
#        events = numpy.array(events, dtype = 'float')
#        print 'Using events'
#    else:
#        events = None

#    T = numpy.empty((len(X), 2), dtype='float')
#    T[:, 0] = X
#    T[:, 1] = events
    #Pair the model outputs with the target events so get_C_index respects censoring
    outputs = np.empty((len(X), 2), dtype='float')
    outputs[:, 0] = Y
    outputs[:, 1] = events
    c_index = get_C_index(T, outputs)
    print("C-Index between these files is: {0}".format(c_index))

    scatter(X, Y, events = events,
            x_label = 'Targets',
            y_label = 'Model output',
            gridsize = 30, mincnt = 0, show_plot = False)
    #plt.xlabel(os.path.basename(sys.argv[1]) + "\nC-Index between these files is: {0}".format(c_index))
    #plt.ylabel('Correlation of ' + os.path.basename(sys.argv[2]))

    plt.savefig('scatter_cens_cind_{cindex}_{0}_{1}.eps'.format(os.path.splitext(os.path.basename(modelfile))[0],
                                                                os.path.splitext(os.path.basename(targetfile))[0],
                                                                cindex=c_index))


    scatter(X, Y,
            x_label = 'Targets',
            y_label = 'Model output',
            gridsize = 30, mincnt = 0, show_plot = False)
    #plt.xlabel(os.path.basename(sys.argv[1]) + "\nC-Index between these files is: {0}".format(c_index))
    #plt.ylabel('Correlation of ' + os.path.basename(sys.argv[2]))

    plt.savefig('scatter_nocens_{cindex}_{0}_{1}.eps'.format(os.path.splitext(os.path.basename(modelfile))[0],
                                                             os.path.splitext(os.path.basename(targetfile))[0],
                                                             cindex=c_index))