def makeplots(tc, path, savepath, remove_structural: bool, nfolds, binarize_list, softmax_list, models_list,
              priors_list, **kwargs):
    """
    Load the pickled per-fold results and create the summary figures.

    For every fold ``n`` in ``range(nfolds)`` the LRs (``lrs_for_model_in_fold_<n>``) and, per target class,
    the metric arrays (``<metric>_<class>_<n>``) are read from ``path``. ROC and histogram plots are made for
    the 'test augm' and 'mixt' data, a scatter plot when more than one prior is given, and box plots of the
    metrics over the folds when ``nfolds > 1``.

    :param tc: target classes as strings; converted to vectors with ``string2vec``
    :param path: directory that contains the pickled results per fold
    :param savepath: directory in which the figures are saved
    :param remove_structural: passed on to ``get_data_per_cell_type``
    :param nfolds: number of folds for which results are loaded
    :param binarize_list: binarize settings; length of the second axis of the metric arrays
    :param softmax_list: softmax settings; length of the third axis of the metric arrays
    :param models_list: model settings; length of the fourth axis of the metric arrays
    :param priors_list: priors; length of the last axis of the metric arrays
    :param kwargs: accepted but not used
    :return: None; the figures are written to ``savepath``
    """

    def load_pickle(filename):
        # Context manager so the handle is closed again; the original left every file open.
        with open(os.path.join(path, filename), 'rb') as handle:
            return pickle.load(handle)

    def filename_safe(class_str):
        # Target class names contain spaces, dots and slashes that are not wanted in file names.
        return class_str.replace(" ", "_").replace(".", "_").replace("/", "_")

    _, _, _, _, _, label_encoder, _, _ = get_data_per_cell_type(
        single_cell_types=single_cell_types, remove_structural=remove_structural)
    target_classes = string2vec(tc, label_encoder)
    class_names = [vec2string(target_class, label_encoder) for target_class in target_classes]

    # ======= Initialize =======
    metric_shape = (nfolds, len(binarize_list), len(softmax_list), len(models_list), len(priors_list))
    coeffs_shape = (nfolds, len(binarize_list), 1, len(marker_names) - 4 + 1, len(priors_list))
    # The keys double as the file name prefixes; the order is the order in which files are read.
    metric_prefixes = ('accuracies_train', 'accuracies_test', 'accuracies_test_as_mixt', 'accuracies_mixt',
                       'accuracies_single', 'cllr_test', 'cllr_test_as_mixt', 'cllr_mixt')
    metrics = {prefix: {name: np.zeros(metric_shape) for name in class_names} for prefix in metric_prefixes}
    coeffs = {name: np.zeros(coeffs_shape) for name in class_names}

    # ======= Load results per fold =======
    lrs_for_model_per_fold = OrderedDict()
    for n in range(nfolds):
        lrs_for_model_per_fold[str(n)] = load_pickle('lrs_for_model_in_fold_{}'.format(n))
        for name in class_names:
            save_name = filename_safe(name)
            for prefix in metric_prefixes:
                metrics[prefix][name][n, ...] = load_pickle('{}_{}_{}'.format(prefix, save_name, n))
            coeffs[name][n, ...] = load_pickle('coeffs_{}_{}'.format(save_name, n))

    # ======= Plots over all LRs =======
    types_data = ['test augm', 'mixt']
    for type_data in types_data:
        lrs_before_for_all_methods, lrs_after_for_all_methods, y_nhot_for_all_methods = \
            append_lrs_for_all_folds(lrs_for_model_per_fold, type=type_data)

        # plot_pavs_all_methods(lrs_before_for_all_methods, lrs_after_for_all_methods, y_nhot_for_all_methods,
        #                       target_classes, label_encoder, savefig=os.path.join(savepath, 'pav_{}'.format(type_data)))
        for kind in ['roc', 'histogram']:
            plot_property_all_lrs_all_folds(lrs_after_for_all_methods, y_nhot_for_all_methods, target_classes,
                                            label_encoder, kind=kind,
                                            savefig=os.path.join(savepath, f'{kind}_{type_data}'))

    # NOTE(review): reconstructed as running once, after the loop above - confirm against the original layout.
    lrs_before_for_all_methods, lrs_after_for_all_methods, y_nhot_for_all_methods = \
        append_lrs_for_all_folds(lrs_for_model_per_fold, type='test augm')
    if len(priors_list) > 1:
        # NOTE(review): `type_data` still holds the last value of the loop above ('mixt'), while the LRs
        # plotted here are those of 'test augm'. The file name is kept as it was to not change the output.
        plot_scatterplots_all_lrs_different_priors(
            lrs_after_for_all_methods, y_nhot_for_all_methods, target_classes, label_encoder,
            savefig=os.path.join(savepath, 'LRs_for_different_priors_{}'.format(type_data)))

    # ======= Plots of the metrics over the folds =======
    if nfolds > 1:
        for name in class_names:
            save_name = filename_safe(name)

            def figure_path(stem, save_name=save_name):
                return os.path.join(savepath, '{}_{}'.format(stem, save_name))

            plot_boxplot_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                   metrics['cllr_test'][name], label_encoder, "$C_{llr}$",
                                   savefig=figure_path('boxplot_cllr_test'))
            plot_boxplot_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                   metrics['cllr_mixt'][name], label_encoder, "$C_{llr}$",
                                   savefig=figure_path('boxplot_cllr_mixtures'))

            # NOTE(review): everything below is assumed to belong to the DEBUG branch - confirm.
            if DEBUG:
                plot_boxplot_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                       metrics['accuracies_train'][name], label_encoder, 'accuracy',
                                       savefig=figure_path('boxplot_accuracy_train'))
                plot_boxplot_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                       metrics['accuracies_test'][name], label_encoder, "accuracy",
                                       savefig=figure_path('boxplot_accuracy_test'))
                plot_boxplot_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                       metrics['cllr_test_as_mixt'][name], label_encoder, "$C_{llr}$",
                                       savefig=figure_path('boxplot_cllr_test_as_mixt'))
                plot_progress_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                        metrics['accuracies_train'][name], label_encoder, 'accuracy',
                                        savefig=figure_path('progress_accuracy_train'))
                plot_progress_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                        metrics['accuracies_test'][name], label_encoder, 'accuracy',
                                        savefig=figure_path('progress_accuracy_test'))
                plot_progress_of_metric(binarize_list, softmax_list, models_list, priors_list,
                                        metrics['cllr_test'][name], label_encoder, '$C_{llr}$',
                                        savefig=figure_path('progress_cllr_test'))
                # The coefficients are plotted as if every marker (and the intercept) were a 'model'.
                plot_boxplot_of_metric(binarize_list, [False],
                                       [[a, True] for a in ['intercept'] + marker_names], priors_list,
                                       coeffs[name], label_encoder, "log LR",
                                       savefig=figure_path('boxplot_coefficients'), ylim=[-3, 3])
def get_final_trained_mlr_model(
        tc,
        single_cell_types,
        retrain,
        n_samples_per_combination,
        binarize=True,
        from_penile=False,
        prior=(1, 1, 1, 1, 1, 1, 1, 1),
        model_name='best_MLR',
        remove_structural=True,
        save_path=None,
        alternative_hypothesis=None,  # blood, nasal, vaginal
        samples_to_evaluate=np.array([[1] * 3 + [0] + [1] * 5 + [0] * 6]),
        use_mixtures=True):
    """
    Computes or loads the MLR based on all data, and writes its plots and coefficient tables.

    :param tc: target classes as strings; converted to vectors with ``string2vec``
    :param single_cell_types: single cell types, passed on to ``get_data_per_cell_type``
    :param retrain: if true, train and calibrate a new model and pickle it; otherwise load the pickled model
    :param n_samples_per_combination: number of augmented samples, used for all three augmented sets
    :param binarize: whether the data is binarized
    :param from_penile: passed on to ``augment_splitted_data``
    :param prior: prior used when augmenting the data; also part of the coefficient figure name
    :param model_name: file name of the pickled model within ``save_path``
    :param remove_structural: passed on when reading the single cell and the mixture data
    :param save_path: directory for the model, tables and figures; has to be given, as the default
        ``None`` makes ``os.path.join`` raise a TypeError
    :param alternative_hypothesis: optional target classes (strings) for a more specific H2; when given, a
        second model is trained and its log LRs are plotted against those of the main model
    :param samples_to_evaluate: samples for which LRs are computed
    :param use_mixtures: if true (and ``retrain``), the mixture data is read and saved as a table
    :return: None
    """
    softmax = False
    mle = MultiLabelEncoder(len(single_cell_types))
    # remove_structural is passed through here; it used to be hard-coded to True, so that single cell and
    # mixture data could be read with different settings.
    X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype, label_encoder, present_markers, \
        present_celltypes = get_data_per_cell_type(single_cell_types=single_cell_types,
                                                   remove_structural=remove_structural)
    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)
    model_path = os.path.join(save_path, model_name)

    def labels_per_target_class(y_nhot):
        # A sample is positive for a target class if any of the cell types in that class is present.
        indices = [np.argwhere(target_classes[i, :] == 1).flatten().tolist()
                   for i in range(target_classes.shape[0])]
        return np.array([np.max(np.array(y_nhot[:, index]), axis=1) for index in indices]).T

    save_data_table(X_single, [vec2string(y, label_encoder) for y in y_nhot_single], present_markers,
                    os.path.join(save_path, 'single cell data.csv'))

    if retrain:
        model = clf_with_correct_settings('MLR', softmax=softmax, n_classes=-1, with_calibration=True)
        X_train, X_calib, y_train, y_calib = train_test_split(
            X_single, y_single, stratify=y_single, test_size=0.5)

        if use_mixtures:
            X_mixtures, y_nhot_mixtures, mixture_label_encoder = read_mixture_data(
                n_celltypes, label_encoder, binarize=binarize, remove_structural=remove_structural)
            save_data_table(X_mixtures,
                            [vec2string(y, label_encoder).replace(' and/or ', '+') for y in y_nhot_mixtures],
                            present_markers, os.path.join(save_path, 'mixture data.csv'))

        # NOTE(review): reconstructed as independent of use_mixtures (no mixture labels are passed) - confirm.
        augmented_data = augment_splitted_data(X_train, y_train, X_calib, y_calib, None, None, None,
                                               n_celltypes, n_features, label_encoder, prior, [binarize],
                                               from_penile, [n_samples_per_combination] * 3,
                                               disallowed_mixtures=None)

        y_train = labels_per_target_class(augmented_data.y_train_nhot_augmented)
        # y_calib = labels_per_target_class(augmented_data.y_calib_nhot_augmented)
        model.fit_classifier(augmented_data.X_train_augmented, y_train)
        model.fit_calibration(augmented_data.X_calib_augmented, augmented_data.y_calib_nhot_augmented,
                              target_classes)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
    else:
        # NOTE: pickle can execute arbitrary code; only load model files from a trusted location.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)

    if alternative_hypothesis:
        # also plot LRs of our hypothesis pairs against LRs when H2 is more specific
        implied_target = string2vec(['Vaginal.mucosa and/or Menstrual.secretion'], label_encoder)
        alternative_target = string2vec(alternative_hypothesis, label_encoder)
        # at least one of H1 or alternative should be present, disallow absence of all:
        # (the builtin int replaces np.int, which was removed from NumPy)
        disallowed_mixtures = (-implied_target - alternative_target).astype(int)

        X_train, X_calib, y_train, y_calib = train_test_split(
            X_single, y_single, stratify=y_single, test_size=0.5)
        X_mixtures, y_nhot_mixtures, mixture_label_encoder = read_mixture_data(
            n_celltypes, label_encoder, binarize=binarize, remove_structural=remove_structural)
        augmented_data = augment_splitted_data(X_train, y_train, X_calib, y_calib, None, None,
                                               y_nhot_mixtures, n_celltypes, n_features, label_encoder, prior,
                                               [binarize], from_penile, [n_samples_per_combination] * 3,
                                               disallowed_mixtures=disallowed_mixtures)

        y_train = labels_per_target_class(augmented_data.y_train_nhot_augmented)
        specific_model = clf_with_correct_settings('MLR', softmax=False, n_classes=-1, with_calibration=True)
        specific_model.fit_classifier(augmented_data.X_train_augmented, y_train)
        specific_model.fit_calibration(augmented_data.X_calib_augmented,
                                       augmented_data.y_calib_nhot_augmented, target_classes)

        # Only the LR of the last target class is compared between the two models.
        log_lrs = []
        specific_log_lrs = []
        for sample in samples_to_evaluate:
            log_lrs.append(np.log10(model.predict_lrs([sample], target_classes))[0][-1])
            specific_log_lrs.append(np.log10(specific_model.predict_lrs([sample], target_classes))[0][-1])
        plot_multiclass_comparison(specific_log_lrs, log_lrs,
                                   ['blood+nas+vag', 'menstr', 'indication of menstr', 'blood', 's***n'],
                                   'specific_hypothesis', save_path,
                                   x_label='log(LR)', y_label='log(LR) H2: blood')

    # NOTE(review): reconstructed as running regardless of alternative_hypothesis - confirm.
    compare_to_multiclass(X_single, y_single, target_classes, tc, model, samples_to_evaluate,
                          save_path=save_path, alternative_target=None)

    # plot the coefficients
    plot_coefficient_importances(model, target_classes, present_markers, label_encoder,
                                 savefig=os.path.join(save_path, 'coefs_{}_{}'.format(prior, model_name)),
                                 show=None)

    # Built once and without touching present_markers: inserting 'intercept' into that list within the loop
    # added one more 'intercept' column to the header for every next target class.
    header = ['intercept'] + list(present_markers)
    for t in range(len(target_classes)):
        intercept, coefficients = model.get_coefficients(t, target_classes[t].squeeze())
        all_coefficients = np.append(intercept, coefficients).tolist()
        # decimal comma, to go with the ';' delimiter
        all_coefficients_str = [str(coef).replace('.', ',') for coef in all_coefficients]

        coefs_filename = os.path.join(save_path, 'coefs_{}_{}.csv'.format(tc[t].replace('/', '_'), model_name))
        # newline='' as required by the csv module; prevents empty rows on Windows.
        with open(coefs_filename, mode='w', newline='') as coefs:
            coefs_writer = csv.writer(coefs, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            coefs_writer.writerow(header)
            coefs_writer.writerow(all_coefficients_str)
def test_augment_data():
    """
    Tests that for given priors, the number of times a cell type occurs
    in the augmented data is in line with what the prior implies.

    With uniform priors every cell type has to occur equally often. With one
    deviating prior, the relative occurrence of that cell type is compared
    with the relative number of samples without it.
    """
    from_penile = False
    mle = MultiLabelEncoder(len(single_cell_types))
    tc = ['Skin', 'Vaginal.mucosa and/or Menstrual.secretion']
    X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype, label_encoder, present_markers, \
        present_celltypes = get_data_per_cell_type(filename='../Datasets/Dataset_NFI_rv.xlsx',
                                                   single_cell_types=single_cell_types,
                                                   remove_structural=True)
    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)

    sample_counts = [11, 22, 33]
    priors = [
        [1, 1, 1, 1, 1, 1, 1, 1],  # uniform priors
        [10, 1, 1, 1, 1, 1, 1, 1],  # cell type 1 occurs 10 times more often
        [1, 10, 10, 10, 10, 10, 10, 10],  # cell type 1 occurs 10 times less often
    ]

    for n_samples in sample_counts:
        print(n_samples)
        for prior in priors:
            print(prior)

            X_augmented, y_nhot = augment_data(X_single, y_single, n_celltypes, n_features, n_samples,
                                               label_encoder, prior, binarize=True, from_penile=from_penile)
            occurrences = np.sum(y_nhot, axis=0)

            if len(np.unique(prior)) == 1 or prior is None:
                # uniform: all cell types occur as often as the first one
                assert all(count == occurrences.tolist()[0] for count in occurrences.tolist())
                continue

            # the value that is in the prior exactly once belongs to the deviating cell type
            value_by_count = {prior.count(value): value for value in list(set(prior))}
            deviating_value = value_by_count[1]
            value_by_count.pop(1)
            common_value = list(value_by_count.values())[0]

            n_with = occurrences[prior.index(deviating_value)]
            n_total = y_nhot.shape[0]
            fraction_with = float(n_with / n_total)
            fraction_without = float((n_total - n_with) / n_total)

            if deviating_value != 1:
                assert round(fraction_with, 5) == round(fraction_without * deviating_value, 5)
            else:
                assert round(fraction_with * common_value, 5) == round(fraction_without, 5)
def nfold_analysis(nfolds, tc, savepath, from_penile: bool, models_list, softmax_list: List[bool],
                   priors_list: List[List], binarize_list: List[bool], test_size: float, calibration_size: float,
                   remove_structural: bool, calibration_on_loglrs: bool, nsamples: Tuple[int, int, int]):
    """
    Train and evaluate all combinations of settings for ``nfolds`` random splits of the single cell data.

    Per fold the data is split into a train, calibration and test set and augmented for every prior. For
    every combination of binarize, softmax, model and prior the LRs are computed and the accuracy and Cllr
    per target class are stored. The results of a fold are pickled in ``<savepath>/picklesaves``.

    :param nfolds: number of folds
    :param tc: target classes as strings; converted to vectors with ``string2vec``
    :param savepath: directory for the results; the subdirectory ``picklesaves`` is created if needed
    :param from_penile: passed on to ``augment_splitted_data``
    :param models_list: models to evaluate; for each element, ``[0]`` is the model name and a false ``[1]``
        marks an uncalibrated model
    :param softmax_list: softmax settings to evaluate
    :param priors_list: priors to evaluate; the first one is the baseline prior, whose augmented test data
        is used to evaluate the models of all priors
    :param binarize_list: binarize settings to evaluate
    :param test_size: ``test_size`` of the split into train and test data
    :param calibration_size: ``test_size`` of the split of the train data into train and calibration data
    :param remove_structural: passed on when reading the single cell and the mixture data
    :param calibration_on_loglrs: passed on to ``calculate_lrs_for_different_priors``
    :param nsamples: passed on to ``augment_splitted_data``
    :return: None
    """
    # Create the output directory up front, rather than failing after the first fold has been computed.
    os.makedirs(os.path.join(savepath, 'picklesaves'), exist_ok=True)

    def dump_pickle(obj, filename):
        # Context manager so the file is flushed and closed; the original never closed its handles.
        with open(os.path.join(savepath, 'picklesaves/{}'.format(filename)), 'wb') as handle:
            pickle.dump(obj, handle)

    mle = MultiLabelEncoder(len(single_cell_types))
    baseline_prior = str(priors_list[0])

    # ======= Load data =======
    X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype, label_encoder, present_markers, \
        present_celltypes = get_data_per_cell_type(single_cell_types=single_cell_types,
                                                   remove_structural=remove_structural)
    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)

    outer = tqdm(total=nfolds, desc='{} folds'.format(nfolds), position=0, leave=False)

    for n in range(nfolds):
        # n = n + (nfolds * run)
        print(n)

        # ======= Initialize =======
        lrs_for_model_in_fold = OrderedDict()
        metric_shape = (len(binarize_list), len(softmax_list), len(models_list), len(priors_list))
        accuracies_train_n, accuracies_test_n, accuracies_test_as_mixtures_n, accuracies_mixtures_n, \
            accuracies_single_n, cllr_test_n, cllr_test_as_mixtures_n, cllr_mixtures_n, coeffs = \
            [dict() for _ in range(9)]
        # File name prefix and results per target class, in the order in which the files are written.
        fold_results = (
            ('accuracies_train', accuracies_train_n),
            ('accuracies_test', accuracies_test_n),
            ('accuracies_test_as_mixt', accuracies_test_as_mixtures_n),
            ('accuracies_mixt', accuracies_mixtures_n),
            ('accuracies_single', accuracies_single_n),
            ('cllr_test', cllr_test_n),
            ('cllr_test_as_mixt', cllr_test_as_mixtures_n),
            ('cllr_mixt', cllr_mixtures_n),
            ('coeffs', coeffs),
        )

        for target_class in target_classes:
            target_class_str = vec2string(target_class, label_encoder)
            for _, per_class in fold_results[:-1]:
                per_class[target_class_str] = np.zeros(metric_shape)
            # one row for the intercept plus one per feature
            coeffs[target_class_str] = np.zeros(
                (len(binarize_list), 1, X_single[0].shape[1] + 1, len(priors_list)))

        # ======= Split data =======
        X_train, X_test, y_train, y_test = train_test_split(
            X_single, y_single, stratify=y_single, test_size=test_size)
        X_train, X_calib, y_train, y_calib = train_test_split(
            X_train, y_train, stratify=y_train, test_size=calibration_size)

        for i, binarize in enumerate(binarize_list):
            X_mixtures, y_nhot_mixtures, mixture_label_encoder = read_mixture_data(
                n_celltypes, label_encoder, binarize=binarize, remove_structural=remove_structural)

            # ======= Augment data for all priors =======
            augmented_data = OrderedDict()
            for priors in priors_list:
                augmented_data[str(priors)] = augment_splitted_data(
                    X_train, y_train, X_calib, y_calib, X_test, y_test, y_nhot_mixtures, n_celltypes,
                    n_features, label_encoder, priors, binarize_list, from_penile, nsamples,
                    disallowed_mixtures=None)

            # ======= Transform data accordingly =======
            if binarize:
                X_test_transformed = [[np.where(replicate > 150, 1, 0) for replicate in sample]
                                      for sample in X_test]
                X_test_transformed = combine_samples(X_test_transformed)
            else:
                X_test_transformed = combine_samples(X_test) / 1000

            for j, softmax in enumerate(softmax_list):
                for k, model_calib in enumerate(models_list):
                    print(model_calib[0])

                    # ======= Calculate LRs before and after calibration =======
                    key_name = bool2str_binarize(binarize) + '_' + bool2str_softmax(softmax) + '_' + \
                        str(model_calib)
                    if not model_calib[1]:
                        key_name += '_uncal'
                    key_name_per_fold = str(n) + '_' + key_name

                    model, lrs_before_calib, lrs_after_calib, y_test_nhot_augmented, \
                        lrs_before_calib_test_as_mixtures, lrs_after_calib_test_as_mixtures, \
                        y_test_as_mixtures_nhot_augmented, lrs_before_calib_mixt, lrs_after_calib_mixt = \
                        calculate_lrs_for_different_priors(augmented_data, X_mixtures, target_classes,
                                                           baseline_prior, present_markers, model_calib, mle,
                                                           label_encoder, key_name_per_fold, softmax,
                                                           calibration_on_loglrs, savepath)

                    lrs_for_model_in_fold[key_name] = LrsBeforeAfterCalib(
                        lrs_before_calib, lrs_after_calib, y_test_nhot_augmented,
                        lrs_before_calib_test_as_mixtures, lrs_after_calib_test_as_mixtures,
                        y_test_as_mixtures_nhot_augmented, lrs_before_calib_mixt, lrs_after_calib_mixt,
                        y_nhot_mixtures)

                    ## Check which samples the method makes an error with
                    # indices_values_above_one = np.argwhere(lrs_for_model_in_fold['[1, 1, 1, 1, 1, 1, 1, 1]'].lrs_before_calib > 1)[:, 0]
                    # indices_values_below_one = np.argwhere(lrs_for_model_in_fold['[1, 1, 1, 1, 1, 1, 1, 1]'].lrs_before_calib < 1)[:, 0]
                    # labels = np.max(np.multiply(augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented, target_class), axis=1)
                    # indices_fp = np.argwhere(labels[indices_values_above_one] == 0)
                    # indices_fn = np.argwhere(labels[indices_values_below_one] == 1)
                    # augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented[indices_values_above_one][indices_fp][:, 0, :]
                    # augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented[indices_values_below_one][indices_fn][:, 0, :]

                    # ======= Calculate performance metrics =======
                    # All test sets come from the baseline prior; only the train set is that of the prior
                    # the model was trained with.
                    baseline_data = augmented_data[baseline_prior]
                    for t, target_class in enumerate(target_classes):
                        target_class_str = vec2string(target_class, label_encoder)
                        for p, priors in enumerate(priors_list):
                            str_prior = str(priors)
                            prior_data = augmented_data[str_prior]
                            prior_model = model[str_prior]

                            accuracies_train_n[target_class_str][i, j, k, p] = \
                                calculate_accuracy_all_target_classes(
                                    prior_data.X_train_augmented, prior_data.y_train_nhot_augmented,
                                    target_classes, prior_model, mle)[t]
                            accuracies_test_n[target_class_str][i, j, k, p] = \
                                calculate_accuracy_all_target_classes(
                                    baseline_data.X_test_augmented, baseline_data.y_test_nhot_augmented,
                                    target_classes, prior_model, mle)[t]
                            accuracies_test_as_mixtures_n[target_class_str][i, j, k, p] = \
                                calculate_accuracy_all_target_classes(
                                    baseline_data.X_test_as_mixtures_augmented,
                                    baseline_data.y_test_as_mixtures_nhot_augmented,
                                    target_classes, prior_model, mle)[t]
                            accuracies_mixtures_n[target_class_str][i, j, k, p] = \
                                calculate_accuracy_all_target_classes(
                                    X_mixtures, y_nhot_mixtures, target_classes, prior_model, mle)[t]
                            accuracies_single_n[target_class_str][i, j, k, p] = \
                                calculate_accuracy_all_target_classes(
                                    X_test_transformed, mle.inv_transform_single(y_test),
                                    target_classes, prior_model, mle)[t]

                            cllr_test_n[target_class_str][i, j, k, p] = cllr(
                                lrs_after_calib[str_prior][:, t],
                                baseline_data.y_test_nhot_augmented, target_class)
                            cllr_test_as_mixtures_n[target_class_str][i, j, k, p] = cllr(
                                lrs_after_calib_test_as_mixtures[str_prior][:, t],
                                baseline_data.y_test_as_mixtures_nhot_augmented, target_class)
                            cllr_mixtures_n[target_class_str][i, j, k, p] = cllr(
                                lrs_after_calib_mixt[str_prior][:, t], y_nhot_mixtures, target_class)

                            if model_calib[0] == 'MLR' and not softmax:
                                # save coefficients: the intercept first, then one per feature
                                intercept, coefficients = prior_model.get_coefficients(t, target_class)
                                coeffs[target_class_str][i, 0, 0, p] = intercept
                                for i_coef, coef in enumerate(coefficients):
                                    coeffs[target_class_str][i, 0, i_coef + 1, p] = coef
        outer.update(1)

        # ======= Save lrs and performance metrics =======
        dump_pickle(lrs_for_model_in_fold, 'lrs_for_model_in_fold_{}'.format(n))

        for target_class in target_classes:
            target_class_str = vec2string(target_class, label_encoder)
            target_class_save = target_class_str.replace(" ", "_").replace(".", "_").replace("/", "_")
            for prefix, per_class in fold_results:
                dump_pickle(per_class[target_class_str], '{}_{}_{}'.format(prefix, target_class_save, n))

    # A manually updated tqdm bar has to be closed explicitly.
    outer.close()
def analyse_data(self):
    """
    Load the measurements and the trained model, compute the LRs and print them.

    The data and the model file name come from ``self.load_data()``. If the marker names that were read
    differ from ``marker_names``, a warning is shown and printed, after which the analysis continues. The
    priors of the numerator and denominator are derived from ``self.top_variables`` and
    ``self.bottom_variables``. Finally the (tkinter) main loop is entered.

    NOTE: the former implementation, which summed class probabilities into a table of LRs and wrote those
    to csv, was present here as a large block of commented-out code only. It has been removed together
    with the local variables that only it used; it can be retrieved from version control if needed.
    """
    # global master, self.tree, button_load
    model_filename, read_marker_names, names, X_single, n_celltypes_with_penile, \
        n_features, n_per_celltype = self.load_data()

    X = combine_samples(X_single)
    print('data loaded, shape {}. {}'.format(X.shape, X[0, :]))

    if read_marker_names != marker_names:
        # Same text for the dialog and the console; only a warning, the analysis is not aborted.
        warning = ("'The marker labels are inconsistent with the trained model, please fix the labels. "
                   "The correct labels are: {}. Found {}").format(marker_names, read_marker_names)
        messagebox.showinfo("Warning", warning)
        print(warning)

    # Load the trained model and all classes present in the trained model.
    # NOTE: pickle can execute arbitrary code; only model files from a trusted source should be opened.
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)

    priors_numerator = get_prior(string2index, self.top_variables)
    priors_denominator = get_prior(string2index, self.bottom_variables)

    # for now, target classes are all separate classes and vag muc+menstr secr.
    target_classes_str = list(single_cell_types) + ['Vaginal.mucosa and/or Menstrual.secretion']

    lrs = model.predict_lrs(X, string2vec(target_classes_str, string2index),
                            priors_numerator=priors_numerator,
                            priors_denominator=priors_denominator)
    print(lrs)

    mainloop()