def analysis_results(options):
    """
    Analyzes the results of the comparisons (analysis by targets).

    Builds (or reloads from ``dcdb_comparisons.csv``) a data frame with one
    row per comparison found in ``<workspace>/comparisons``, drops the rows
    with missing values, removes the drug pairs involved in the most me-too
    conflicts, evaluates every top-1% dcGUILD feature on its own with
    repeated n-fold cross-validation, and saves and shows a box plot with
    the distribution of the AUCs obtained for each feature.

    Parameters:
        options: object with the parsed command-line options. The
            attributes read here are ``workspace``, ``threshold_list`` and
            ``consider_se``.

    Returns:
        None. Results are printed and written under
        ``<workspace>/analysis``.

    Raises:
        SystemExit: with code 10 when a comparison is missing from the
            pair2comb dictionary or has no ``results_table.tsv``.
    """

    def _load_pickle(path):
        # The file mode is the default (text) one on purpose: it is the mode
        # the pickles have always been read with. The handle is now closed.
        # NOTE(review): pickle must only be used on trusted files.
        with open(path) as pickle_fd:
            return cPickle.load(pickle_fd)

    def _dump_pickle(obj, path):
        # Same mode as before ('w'), but the handle is closed (and therefore
        # flushed) as soon as the dump finishes.
        with open(path, 'w') as pickle_fd:
            cPickle.dump(obj, pickle_fd)

    def _split_by_label(frame):
        # Separate drug combinations (1) from non-drug combinations (0)
        positives = frame[frame['combination'] == 1]
        negatives = frame[frame['combination'] == 0]
        return positives, negatives, len(positives.index), len(negatives.index)

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    consider_se = bool(options.consider_se)

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = _load_pickle(pair2comb_file)

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # One single-row data frame per comparison. They are concatenated
        # once at the end instead of appending inside the loop, which
        # copies the whole table at every iteration.
        row_frames = []

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns, combination_field)

            row_frames.append(pd.DataFrame([results], columns=columns, index=[comparison]))

        # Build the main data frame (empty, with the right columns, if
        # there was no comparison folder at all)
        if row_frames:
            df = pd.concat(row_frames)
        else:
            df = pd.DataFrame(columns=columns)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan.
    # The check is done on the VALUES of the column: the expression
    # "'None' in df['dcstructure']" looks the label up in the index.
    if df['dcstructure'].isin(['None']).any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        _dump_pickle(me_too_drug_pairs, me_too_drug_pairs_file)
        _dump_pickle(me_too_drug_comb_pairs, me_too_drug_comb_pairs_file)

    else:

        me_too_drug_pairs = _load_pickle(me_too_drug_pairs_file)
        me_too_drug_comb_pairs = _load_pickle(me_too_drug_comb_pairs_file)

    # Process me-too drug combination pairs: count in how many me-too pairs
    # every drug pair takes part
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1

    # From every me-too pair that is still unresolved, discard the member
    # which appears in more me-too pairs (the second one in case of a tie)
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))

    #-------------------------#
    #   EVALUATE PERFORMANCE  #
    #-------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Machine learning parameters
    repetitions = 25  # Number of repetitions
    n_fold = 10       # Number of folds
    classifier = 'SVC'  # Key of the classifier (see below) used in the evaluation
    classifiers = {
        'KNeighbors': KNeighborsClassifier(3),
        'SVC': SVC(probability=True),
        'SVC linear': SVC(kernel="linear", C=0.025),
        'SVC rbf': SVC(gamma=2, C=1),
        'DecisionTree': DecisionTreeClassifier(max_depth=5),
        'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP': MLPClassifier(alpha=1),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis(),
        'SVC best 1': SVC(kernel="linear", C=0.1, probability=True),
        'SVC best 2': SVC(kernel="rbf", gamma=0.01, C=100.0, probability=True)
    }

    # Plot of distributions of AUC
    plot_name = os.path.join(img_dir, 'dcGUILD_1_threshold_auc.{}'.format(fig_format))

    print('\nEVALUATION OF DCGUILD\n')

    # Distribution of AUCs obtained by every feature
    feature_to_aucs = {}

    # Get dcGUILD columns. Every feature is evaluated alone, so each one is
    # associated to itself plus the label column.
    dcGUILD_features = []
    dcGUILD_feature_to_columns = {}
    for top_threshold in [1]:
        for data_type in ['node', 'edge', 'function']:
            for scoring_function in ['dot_product', 'spearman', 'jaccard']:
                col = 'dcg' + '_' + data_type + '_' + str(top_threshold) + '_' + scoring_function
                dcGUILD_features.append(col)
                dcGUILD_feature_to_columns[col] = [col, 'combination']

    for feature in dcGUILD_features:

        df_method = df[dcGUILD_feature_to_columns[feature]]

        dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df_method)

        print(feature)
        print('Building {} repetition groups of {} (same) DC and {} (different) non-DC'.format(
            repetitions, num_dc, num_dc))
        # Obtain n number of groups containing different non-drug
        # combinations to repeat the analysis n times
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc)

        mean_aucs = []  # Here we will store the means of AUCs from the cross-validations
        std_aucs = []   # Here we will store the standard deviations of the AUCs from the cross-validations
        all_aucs = []   # Here we will store ALL the AUCs

        # Number of items in each group of the cross-validation (it does
        # not change between repetitions)
        num_items_group = int(float(num_dc) / float(n_fold))

        for num_repetition, ndc_data_equal in enumerate(ndc_repetitions, 1):

            if num_repetition == 1:
                print('Building {} fold groups of {} DC and {} non-DC x {} repetitions'.format(
                    n_fold, num_items_group, num_items_group, repetitions))

            # Defining the drug combination groups in each cross-validation step
            dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                dc_data, n_fold, num_items_group, me_too_drug_combinations)
            # Defining the non-drug combination groups in each cross-validation step
            ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations)
            merged_groups = [pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)]

            # Only the mean, the standard deviation and the list of AUCs
            # are used afterwards
            mean, _, std, list_auc, _ = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                n_fold, merged_groups, classifiers[classifier])

            mean_aucs.append(mean)
            std_aucs.append(std)
            all_aucs.extend(list_auc)

        final_mean = np.mean(mean_aucs)
        mean_std = np.mean(std_aucs)
        std = np.std(all_aucs)
        print('FINAL MEAN: {}'.format(final_mean))
        print('MEAN of STD: {}'.format(mean_std))
        print('STD: {}\n'.format(std))

        # Store the distribution of AUCs in the dictionary
        feature_to_aucs[feature] = all_aucs

    #------------------------------#
    #   PLOT DISTRIBUTION OF AUC   #
    #------------------------------#

    fig = pylab.figure(dpi=300)
    ax = pylab.axes()

    pos = 1
    xticks = []   # Define the places in which the labels will be
    xlabels = []  # Define the labels (the names of the features)

    for feature in dcGUILD_features:

        # One box per feature, with its label right below it
        pylab.boxplot(feature_to_aucs[feature], positions=[pos], widths=0.6, patch_artist=True)
        xticks.append(pos)
        xlabels.append(feature)

        pos += 2  # Add separation between boxplots

    # Set axes limits and labels
    pylab.xlim(0, pos - 1)
    pylab.ylim(0, 1)
    # The positions of the ticks are fixed BEFORE naming them
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels)
    pylab.xlabel('Features')
    pylab.ylabel('Distribution of AUC values')

    fig.autofmt_xdate()
    pylab.savefig(plot_name, format=fig_format)
    pylab.show()

    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))

    return
def analysis_results(options):
    """
    Analyzes the results of the comparisons (selection of classifier).

    Builds (or reloads from CSV) a data frame with one row per crossing
    listed in the crossings file, drops the rows with missing values,
    removes the drug pairs involved in the most me-too conflicts, splits
    the data in a training (80%) and a validation set, reduces the features
    (PCA or a manual selection) and tunes a SVC with repeated grid searches.
    The number of times each parameter combination was the best one is
    written to ``tables/tuning_results.tsv``, most frequent first.

    Parameters:
        options: object with the parsed command-line options. The
            attributes read here are ``workspace``, ``threshold_list``,
            ``consider_se``, ``comparison_other_methods``, ``sif``,
            ``crossings_file`` and ``pca``.

    Returns:
        None. Results are printed and written under
        ``<workspace>/analysis`` (``<workspace>/analysis_comparison`` when
        ``options.comparison_other_methods`` is set).

    Raises:
        SystemExit: with code 10 when a crossing has no
            ``results_table.tsv`` or is missing from the pair2comb
            dictionary.
    """

    def _load_pickle(path):
        # The file mode is the default (text) one on purpose: it is the mode
        # the pickles have always been read with. The handle is now closed.
        # NOTE(review): pickle must only be used on trusted files.
        with open(path) as pickle_fd:
            return cPickle.load(pickle_fd)

    def _dump_pickle(obj, path):
        # Same mode as before ('w'), but the handle is closed (and therefore
        # flushed) as soon as the dump finishes.
        with open(path, 'w') as pickle_fd:
            cPickle.dump(obj, pickle_fd)

    def _split_by_label(frame):
        # Separate drug combinations (1) from non-drug combinations (0)
        positives = frame[frame['combination'] == 1]
        negatives = frame[frame['combination'] == 0]
        return positives, negatives, len(positives.index), len(negatives.index)

    # Start marker for time measure
    start = time.time()

    print("\n\t\t----------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Selection of classifier\n")
    print("\t\t----------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles and comparisons
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    # Create a directory for the analysis inside the workspace
    analysis_dir = os.path.join(options.workspace, "analysis")
    create_directory(analysis_dir)

    # Create a directory for the analysis of the comparison with other methods
    if options.comparison_other_methods:
        analysis_dir = os.path.join(options.workspace, "analysis_comparison")
        create_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    consider_se = bool(options.consider_se)

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = _load_pickle(pair2comb_file)

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        output_dataframe = os.path.join(analysis_dir, 'comparison_other_methods.csv')
    else:
        output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # One single-row data frame per crossing. They are concatenated
        # once at the end instead of appending inside the loop, which
        # copies the whole table at every iteration.
        row_frames = []

        # Prepare files
        network_filename = ntpath.basename(options.sif)
        drugbank2targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
        drug2targets = _load_pickle(drugbank2targets_file)

        # Open the crossings file
        crossings_file = options.crossings_file

        with open(crossings_file, 'r') as crossings_file_fd:
            for line in crossings_file_fd:
                crossing = line.strip()
                if not crossing:
                    # A blank line (e.g. at the end of the file) is not a
                    # crossing and cannot be split in two drugs
                    continue
                drug1, drug2 = crossing.split('---')

                # Get drug IDs
                targets1 = list(drug2targets[drug1.upper()])
                drug_id1 = diana_drug.generate_drug_id(drug1, targets1, network_filename)
                targets2 = list(drug2targets[drug2.upper()])
                drug_id2 = diana_drug.generate_drug_id(drug2, targets2, network_filename)

                # Check results table
                comparison = '{}---{}'.format(drug_id1, drug_id2)
                comparison_dir = os.path.join(results_dir, comparison)
                results_table = os.path.join(comparison_dir, 'results_table.tsv')
                if not fileExist(results_table):
                    print('The comparison of {} ({}) and {} ({}) has not been executed properly!\n'.format(drug1, drug_id1, drug2, drug_id2))
                    sys.exit(10)

                if crossing in pair2comb:
                    combination_field = pair2comb[crossing]
                else:
                    print('The comparison {} is not in the pair2comb dictionary!\n'.format(crossing))
                    sys.exit(10)

                results = diana_analysis.get_results_from_table(results_table, columns, combination_field)

                row_frames.append(pd.DataFrame([results], columns=columns, index=[comparison]))

        # Build the main data frame (empty, with the right columns, if the
        # crossings file had no crossing at all)
        if row_frames:
            df = pd.concat(row_frames)
        else:
            df = pd.DataFrame(columns=columns)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan.
    # The check is done on the VALUES of the column: the expression
    # "'None' in df['dcstructure']" looks the label up in the index.
    if df['dcstructure'].isin(['None']).any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        _dump_pickle(me_too_drug_pairs, me_too_drug_pairs_file)
        _dump_pickle(me_too_drug_comb_pairs, me_too_drug_comb_pairs_file)

    else:

        me_too_drug_pairs = _load_pickle(me_too_drug_pairs_file)
        me_too_drug_comb_pairs = _load_pickle(me_too_drug_comb_pairs_file)

    # Process me-too drug combination pairs: count in how many me-too pairs
    # every drug pair takes part
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1

    # From every me-too pair that is still unresolved, discard the member
    # which appears in more me-too pairs (the second one in case of a tie)
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))

    #-----------------------------------------------------------#
    #   DIVIDE THE DATASET IN A TRAINING AND A VALIDATION SET   #
    #-----------------------------------------------------------#

    proportion_training = 0.8

    # Change the name of the output files if we are doing a comparison with other methods
    if options.comparison_other_methods:
        training_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_training.csv')
        validation_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_validation.csv')
    else:
        training_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_training.csv')
        validation_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_validation.csv')

    if not fileExist(training_dataframe) or not fileExist(validation_dataframe):

        num_dc_training = int(round(num_dc*proportion_training))
        num_ndc_training = int(round(num_ndc*proportion_training))
        print('Training set (positives): {} out of {} ({}%)\n'.format(num_dc_training, num_dc, proportion_training*100))
        print('Training set (negatives): {} out of {} ({}%)\n'.format(num_ndc_training, num_ndc, proportion_training*100))

        # Get a random sample for training
        dc_data_training = dc_data.sample(n=num_dc_training)
        ndc_data_training = ndc_data.sample(n=num_ndc_training)
        # The validation set is what remains after taking the sample
        dc_data_validation = dc_data.loc[~dc_data.index.isin(dc_data_training.index)]
        ndc_data_validation = ndc_data.loc[~ndc_data.index.isin(ndc_data_training.index)]

        df_training = pd.concat([dc_data_training, ndc_data_training])
        df_validation = pd.concat([dc_data_validation, ndc_data_validation])

        # Output the Pandas dataframes in a CSV file
        df_training.to_csv(training_dataframe)
        df_validation.to_csv(validation_dataframe)

    else:

        df_training = pd.read_csv(training_dataframe, index_col=0)
        df_validation = pd.read_csv(validation_dataframe, index_col=0)

    # Define the variables for the training dataset (from here on, only the
    # training set is used)
    df = df_training
    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    print('Number of drug combinations after getting the training dataset:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after getting the training dataset:\t{}\n'.format(num_ndc))

    #------------------------------------------------------------------#
    #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
    #------------------------------------------------------------------#

    if options.pca:

        # Strategy:
        # We calculate the explained variance ratio for all the features.
        # We define a cut-off threshold of the minimum variance ratio that we consider relevant.
        # We count the number of components with explained variance higher than the cut-off defined.
        # Then, we reduce the dimensionality to that number of components.

        variance_cut_off = 0.01

        df_raw = df.drop('combination', axis=1)
        raw_columns = [column for column in columns if column != 'combination']

        pca = PCA(n_components=None)
        pca.fit(df_raw)
        values_trans = pca.transform(df_raw)
        explained_variance = pca.explained_variance_ratio_
        num_components = sum(1 for var in explained_variance if var > variance_cut_off)

        if num_components < len(raw_columns):

            print('Number of features:\t{}\n'.format(len(raw_columns)))
            print('Reduction to {} components\n'.format(num_components))

            pca = PCA(n_components=num_components)
            pca.fit(df_raw)
            values_trans = pca.transform(df_raw)

        # New table with the components plus the label as last column
        indexes = df.index.values
        df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
        df_comb = df[['combination']]
        df = pd.concat([df_trans, df_comb], axis=1)

    else:

        # Manually introduced features
        guild_thresholds = [1, 5]
        rank_scoring = ['spearman', 'dot_product']
        list_scoring = ['jaccard']
        selected_columns = diana_analysis.obtain_columns_best_features(
            guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)

        print('Selected columns: {}\n'.format(', '.join(selected_columns)))
        print('Number of selected features: {}\n'.format(len(selected_columns)-1))  # We take away the combinations column

        # Define the new table with the selected columns
        df = df[selected_columns]

    dc_data, ndc_data, num_dc, num_ndc = _split_by_label(df)

    #------------------------------------------#
    #   TUNE THE ALGORITHM OF THE CLASSIFIER   #
    #------------------------------------------#

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)
    results_table = os.path.join(tables_dir, 'tuning_results.tsv')

    classifier = 'SVC'

    pipe_svc = Pipeline([('slc', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range_c = [1.0, 10.0, 100]
    param_range_gamma = [1e-4, 1e-3, 0.01, 0.1]

    param_grid = [{'clf__C': param_range_c,
                   'clf__kernel': ['linear']},
                  {'clf__C': param_range_c,
                   'clf__gamma': param_range_gamma,
                   'clf__kernel': ['rbf']}]

    print('TUNNING THE ALGORITHM OF {}\n'.format(classifier.upper()))

    rounds = 2
    repetitions = 25
    # Number of grid searches won by every combination of parameters
    dict_results = {}

    for n_round in range(rounds):

        print('ROUND NUMBER {}\n'.format(n_round+1))

        # Obtain n number of groups containing different non-drug
        # combinations to repeat the analysis n times
        ndc_training_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc)

        for ndc_training_data in ndc_training_groups:

            merged_groups = pd.concat([dc_data, ndc_training_data])

            # The label is taken by name, so the result does not depend on
            # 'combination' being the last column of the table
            X_train = merged_groups.drop('combination', axis=1)
            y_train = merged_groups['combination']

            grid_search = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
            grid = grid_search.fit(X_train, y_train)
            print(grid)

            # summarize the results of the grid search
            print('Grid best score: {}'.format(grid.best_score_))
            result = str(grid.best_params_)
            print('Grid best parameters: {}\n'.format(result))
            dict_results.setdefault(result, 0)
            dict_results[result] += 1

    print('\nFINAL RESULT\n')

    # Most frequently selected parameters first (ties sorted by their text)
    ranking = sorted(dict_results.items(), key=lambda item: (-item[1], item[0]))

    with open(results_table, 'w') as results_table_fd:
        for param_comb, times_best in ranking:
            print('{}\t{}\n'.format(param_comb, times_best))
            results_table_fd.write('{}\t{}\n'.format(param_comb, times_best))

    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))

    return
def _load_pickle(pickle_path):
    """
    Loads and returns the object stored in the pickle file `pickle_path`.
    The file is closed as soon as the object has been read.
    """
    # NOTE: unpickling can execute arbitrary code, use only on trusted files.
    # The open mode is the one the code has always used, so files written
    # by previous runs keep loading the same way.
    with open(pickle_path) as pickle_fd:
        return cPickle.load(pickle_fd)


def _dump_pickle(obj, pickle_path):
    """
    Stores `obj` in the pickle file `pickle_path` and closes the file.
    """
    with open(pickle_path, 'w') as pickle_fd:
        cPickle.dump(obj, pickle_fd)


def _split_by_combination(df):
    """
    Splits `df` by the value of its 'combination' column.
    Returns (rows with value 1, rows with value 0, number of rows with
    value 1, number of rows with value 0).
    """
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    return dc_data, ndc_data, len(dc_data.index), len(ndc_data.index)


def _feature_columns(columns):
    """
    Returns a copy of `columns` without the 'combination' column.
    """
    raw_columns = copy.copy(columns)
    raw_columns.remove('combination')
    return raw_columns


def _write_ranked_features(output_path, names, scores):
    """
    Prints and writes in `output_path` one "name<TAB>score" line per
    feature, sorted by decreasing score.
    """
    ranked = sorted(zip(names, scores), key=lambda item: item[1], reverse=True)
    with open(output_path, 'w') as output_fd:
        for name, score in ranked:
            print(name, score)
            output_fd.write('{}\t{}\n'.format(name, score))


def _write_ranked_correlations(output_path, comp_to_corr):
    """
    Prints and writes in `output_path` one "comparison<TAB>correlation<TAB>
    p-value" line per entry of `comp_to_corr` (a dictionary
    comparison => (correlation, p-value)), sorted by decreasing correlation.
    """
    ranked = sorted(comp_to_corr.items(), key=lambda item: item[1][0], reverse=True)
    with open(output_path, 'w') as output_fd:
        for comp, corr in ranked:
            print(comp, corr[0])
            output_fd.write('{}\t{}\t{}\n'.format(comp, corr[0], corr[1]))


def _process_me_too_pairs(me_too_drug_comb_pairs):
    """
    Processes the me-too pairs of drug pairs (strings 'pairA___pairB').
    Returns:
      - a set with one frozenset([pairA, pairB]) per me-too pair
      - the set of drug pairs to remove: for every me-too pair in which none
        of the two members has been removed yet, the member that appears in
        more me-too pairs is removed (the second one if there is a tie)
    """
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        for drug_comb in (drug_comb1, drug_comb2):
            drug_pair_to_me_too_times[drug_comb] = drug_pair_to_me_too_times.get(drug_comb, 0) + 1

    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    return me_too_drug_combinations, removed_drug_pairs


def analysis_results(options):
    """
    Analyzes the results of the comparisons: selection of classifier.

    Steps:
      1. Gathers the results table of every crossing of the crossings file in
         a data frame (cached as a CSV file in the analysis directory).
      2. Removes the rows with missing values.
      3. Removes the conflictive me-too drug pairs.
      4. Divides the dataset in a training (80%) and a validation set (both
         cached as CSV files) and continues with the training set.
      5. Reduces the data with PCA (options.pca) or writes the variance,
         correlation and forest importance tables and keeps the manually
         selected features.
      6. Evaluates the classifiers by repeated cross-validation, plots the
         distribution of AUCs and writes a table with their mean and
         standard deviation. This step is skipped if the plot already exists.

    Attributes read from `options`: workspace, comparison_other_methods,
    threshold_list, consider_se, sif, crossings_file, pca.

    The program exits with code 10 if the results table of a crossing is
    missing or if a crossing is not in the pair2comb dictionary.
    """

    # Start marker for time measure
    start = time.time()

    dashes = "----------------------------------------------------------------------------------------------------------------------------"
    print("\n\t\t" + dashes + "\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Selection of classifier\n")
    print("\t\t" + dashes + "\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles and comparisons
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    # Create a directory for the analysis inside the workspace
    analysis_dir = os.path.join(options.workspace, "analysis")
    create_directory(analysis_dir)

    # Create a directory for the analysis of the comparison with other methods
    if options.comparison_other_methods:
        analysis_dir = os.path.join(options.workspace, "analysis_comparison")
        create_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    consider_se = bool(options.consider_se)

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb = _load_pickle(os.path.join(toolbox_dir, 'pair2comb.pcl'))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    # The name of the output file changes if we are doing a comparison with other methods
    if options.comparison_other_methods:
        output_dataframe = os.path.join(analysis_dir, 'comparison_other_methods.csv')
    else:
        output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # One single-row data frame per comparison. They are concatenated
        # only once at the end: appending inside the loop copies the whole
        # table at every iteration, and DataFrame.append does not exist in
        # pandas >= 2.0. The empty frame goes first, as it did before.
        comparison_frames = [pd.DataFrame(columns=columns)]

        # Prepare files
        network_filename = ntpath.basename(options.sif)
        drug2targets = _load_pickle(os.path.join(toolbox_dir, 'drugbank_to_targets.pcl'))

        # Open the crossings file
        with open(options.crossings_file, 'r') as crossings_file_fd:
            for line in crossings_file_fd:
                crossing = line.strip()
                drug1, drug2 = crossing.split('---')

                # Get drug IDs
                targets1 = list(drug2targets[drug1.upper()])
                drug_id1 = diana_drug.generate_drug_id(drug1, targets1, network_filename)
                targets2 = list(drug2targets[drug2.upper()])
                drug_id2 = diana_drug.generate_drug_id(drug2, targets2, network_filename)

                # Check results table
                comparison = '{}---{}'.format(drug_id1, drug_id2)
                results_table = os.path.join(results_dir, comparison, 'results_table.tsv')
                if not fileExist(results_table):
                    print('The comparison of {} ({}) and {} ({}) has not been executed properly!\n'.format(drug1, drug_id1, drug2, drug_id2))
                    sys.exit(10)

                # Get the Comb field (if it is drug combination or not)
                if crossing not in pair2comb:
                    print('The comparison {} is not in the pair2comb dictionary!\n'.format(crossing))
                    sys.exit(10)
                combination_field = pair2comb[crossing]

                results = diana_analysis.get_results_from_table(results_table, columns, combination_field)
                comparison_frames.append(pd.DataFrame([results], columns=columns, index=[comparison]))

        df = pd.concat(comparison_frames)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan.
    # The VALUES of the column are checked here: the "in" operator on a
    # Series looks at the index labels, so it never found the 'None' strings.
    if df['dcstructure'].isin(['None']).any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the rows with nan values
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_combination(df)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if fileExist(me_too_drug_pairs_file) and fileExist(me_too_drug_comb_pairs_file):
        me_too_drug_pairs = _load_pickle(me_too_drug_pairs_file)
        me_too_drug_comb_pairs = _load_pickle(me_too_drug_comb_pairs_file)
    else:
        df_struc = df[['dcstructure']].astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        _dump_pickle(me_too_drug_pairs, me_too_drug_pairs_file)
        _dump_pickle(me_too_drug_comb_pairs, me_too_drug_comb_pairs_file)

    # Process me-too drug combination pairs
    me_too_drug_combinations, removed_drug_pairs = _process_me_too_pairs(me_too_drug_comb_pairs)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data, ndc_data, num_dc, num_ndc = _split_by_combination(df)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))

    #-----------------------------------------------------------#
    #   DIVIDE THE DATASET IN A TRAINING AND A VALIDATION SET   #
    #-----------------------------------------------------------#

    proportion_training = 0.8

    # The names of the output files change if we are doing a comparison with other methods
    if options.comparison_other_methods:
        training_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_training.csv')
        validation_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_validation.csv')
    else:
        training_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_training.csv')
        validation_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_validation.csv')

    if fileExist(training_dataframe) and fileExist(validation_dataframe):

        # Only the training set is used from here on
        df_training = pd.read_csv(training_dataframe, index_col=0)

    else:

        num_dc_training = int(round(num_dc * proportion_training))
        num_ndc_training = int(round(num_ndc * proportion_training))
        print('Training set (positives): {} out of {} ({}%)\n'.format(num_dc_training, num_dc, proportion_training * 100))
        print('Training set (negatives): {} out of {} ({}%)\n'.format(num_ndc_training, num_ndc, proportion_training * 100))

        # Get a random sample for the training set
        dc_data_training = dc_data.sample(n=num_dc_training)
        ndc_data_training = ndc_data.sample(n=num_ndc_training)

        # The validation set is what remains after taking the sample
        dc_data_validation = dc_data.loc[~dc_data.index.isin(dc_data_training.index)]
        ndc_data_validation = ndc_data.loc[~ndc_data.index.isin(ndc_data_training.index)]

        df_training = pd.concat([dc_data_training, ndc_data_training])
        df_validation = pd.concat([dc_data_validation, ndc_data_validation])

        # Output the Pandas dataframes in a CSV file
        df_training.to_csv(training_dataframe)
        df_validation.to_csv(validation_dataframe)

    # Define the variables for the training dataset
    df = df_training
    dc_data, ndc_data, num_dc, num_ndc = _split_by_combination(df)
    print('Number of drug combinations after getting the training dataset:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after getting the training dataset:\t{}\n'.format(num_ndc))

    #------------------------------------------------------------------#
    #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
    #------------------------------------------------------------------#

    if options.pca:

        # Strategy:
        # We calculate the explained variance ratio for all the components.
        # We define a cut-off threshold of the minimum variance ratio that we consider relevant.
        # We count the number of components with explained variance ratio higher than the cut-off.
        # Then, we reduce the dimensionality to that number of components.

        variance_cut_off = 0.01

        df_raw = df.drop('combination', axis=1)
        raw_columns = _feature_columns(columns)

        pca = PCA(n_components=None)
        pca.fit(df_raw)
        num_components = sum(1 for var in pca.explained_variance_ratio_ if var > variance_cut_off)

        # NOTE(review): the indentation of the original source was lost. The
        # projection is assumed to be applied only when some component is
        # discarded -- confirm against the original script.
        if num_components < len(raw_columns):

            print('Number of features:\t{}\n'.format(len(raw_columns)))
            print('Reduction to {} components\n'.format(num_components))

            pca = PCA(n_components=num_components)
            pca.fit(df_raw)
            values_trans = pca.transform(df_raw)
            df_trans = pd.DataFrame.from_records(values_trans, index=df.index.values)

            # The 'combination' column goes back as the last column
            df = pd.concat([df_trans, df[['combination']]], axis=1)
            dc_data, ndc_data, num_dc, num_ndc = _split_by_combination(df)

    else:

        tables_dir = os.path.join(analysis_dir, 'tables')
        create_directory(tables_dir)

        # Table with the variance of each feature
        variance_features_file = os.path.join(tables_dir, 'variance_features.txt')
        variance_cut_off = 0.01
        if not fileExist(variance_features_file):

            from sklearn.feature_selection import VarianceThreshold

            df_raw = df.drop('combination', axis=1)
            selector = VarianceThreshold(variance_cut_off)
            selector.fit(df_raw)
            variances = selector.variances_
            print(variances)

            _write_ranked_features(variance_features_file, _feature_columns(columns), variances)

        # Tables with the Pearson correlation between related features
        correlation_features_file = os.path.join(tables_dir, 'correlation_thresholds.txt')
        correlation_scoring_file = os.path.join(tables_dir, 'correlation_scoring.txt')
        correlation_scoring_file_guild = os.path.join(tables_dir, 'correlation_scoring_guild.txt')
        if not fileExist(correlation_features_file) or not fileExist(correlation_scoring_file):

            from scipy.stats import pearsonr

            df_raw = df.drop('combination', axis=1)
            scoring_functions = ['dot_product', 'spearman', 'jaccard']

            def correlation(col1, col2):
                pcorr, pvalue = pearsonr(df_raw[col1], df_raw[col2])
                return (pcorr, pvalue)

            # Same dcg feature at two different thresholds
            comp_to_corr = {}
            for thr1 in threshold_list:
                for thr2 in threshold_list:
                    if thr1 == thr2:
                        continue
                    thresholds_str = ' '.join([str(x) for x in sorted([thr1, thr2])])
                    for data_type in ['node', 'edge', 'function']:
                        for scoring_function in scoring_functions:
                            col1 = '_'.join(['dcg', data_type, str(thr1), scoring_function])
                            col2 = '_'.join(['dcg', data_type, str(thr2), scoring_function])
                            comp_to_corr[thresholds_str + ' ' + data_type + ' ' + scoring_function] = correlation(col1, col2)

            _write_ranked_correlations(correlation_features_file, comp_to_corr)

            # Same dct / dcatc / dcse feature with two different scoring functions
            comp_to_corr = {}
            for sc1 in scoring_functions:
                for sc2 in scoring_functions:
                    if sc1 == sc2:
                        continue
                    scorings_str = ' '.join(sorted([sc1, sc2]))
                    for data_type in ['target', 'pfam', 'function']:
                        col1 = '_'.join(['dct', data_type, sc1])
                        col2 = '_'.join(['dct', data_type, sc2])
                        comp_to_corr[scorings_str + ' ' + 'targets' + ' ' + data_type] = correlation(col1, col2)
                    for method in ['dcatc', 'dcse']:
                        comp_to_corr[scorings_str + ' ' + method] = correlation(method + '_' + sc1, method + '_' + sc2)

            _write_ranked_correlations(correlation_scoring_file, comp_to_corr)

            # Same dcg feature with two different scoring functions
            comp_to_corr = {}
            for sc1 in scoring_functions:
                for sc2 in scoring_functions:
                    if sc1 == sc2:
                        continue
                    scorings_str = ' '.join(sorted([sc1, sc2]))
                    for threshold in threshold_list:
                        for data_type in ['node', 'edge', 'function']:
                            col1 = '_'.join(['dcg', data_type, str(threshold), sc1])
                            col2 = '_'.join(['dcg', data_type, str(threshold), sc2])
                            comp_to_corr[scorings_str + ' ' + str(threshold) + ' ' + data_type] = correlation(col1, col2)

            _write_ranked_correlations(correlation_scoring_file_guild, comp_to_corr)

        # Table with the importance of each feature in a forest of trees
        forest_features_file = os.path.join(tables_dir, 'forest_importances.txt')
        if not fileExist(forest_features_file):

            from sklearn.ensemble import ExtraTreesClassifier

            # The last column of the data frame is used as the class
            X, y = df.iloc[:, :-1], df.iloc[:, -1]
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)

            _write_ranked_features(forest_features_file, _feature_columns(columns), clf.feature_importances_)

        # Manually introduced features
        guild_thresholds = [1, 5]
        rank_scoring = ['spearman', 'dot_product']
        list_scoring = ['jaccard']
        selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)

        print('Selected columns: {}\n'.format(', '.join(selected_columns)))
        print('Number of selected features: {}\n'.format(len(selected_columns) - 1))  # We take away the combinations column

        # Define the new table with the selected columns
        df = df[selected_columns]
        dc_data, ndc_data, num_dc, num_ndc = _split_by_combination(df)

    #--------------------------#
    #   EVALUATE CLASSIFIERS   #
    #--------------------------#

    pca_str = '_withPCA' if options.pca else '_withoutPCA'

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'
    plot_name = os.path.join(img_dir, 'evaluation_classifiers{}.{}'.format(pca_str, fig_format))

    classifiers = {
        'KNeighbors': KNeighborsClassifier(),
        'SVC rbf': SVC(kernel="rbf"),
        'SVC linear': SVC(kernel="linear"),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'MLP': MLPClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis()
    }
    classifiers_order = ['AdaBoost', 'DecisionTree', 'GaussianNB', 'KNeighbors', 'MLP', 'QuadraticDiscr.', 'RandomForest', 'SVC rbf', 'SVC linear']

    repetitions = 25
    n_fold = 10

    # The existing plot marks the evaluation as done. Everything below uses
    # the dictionaries filled by the evaluation, so it is skipped with it.
    if not fileExist(plot_name):

        print('\nEVALUATION OF THE CLASSIFIERS\n')

        classifier_to_aucs = {}     # classifier => list with ALL the AUCs
        classifier_to_results = {}  # classifier => (mean of AUCs, std of AUCs)

        # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc)

        # Number of items in each group of the cross-validation
        num_items_group = int(float(num_dc) / float(n_fold))

        for classifier in classifiers:

            print('Classifier: {}\n'.format(classifier))

            all_aucs = []  # Here we will store ALL the AUCs

            for ndc_data_equal in ndc_repetitions:

                # Define the drug combination / non-drug combination groups in each cross-validation step
                dc_groups = diana_analysis.obtain_n_groups_of_k_length(dc_data, n_fold, num_items_group, me_too_drug_combinations)
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations)
                merged_groups = [pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)]

                mean, var, std, list_auc = diana_analysis.run_nfold_crossvalidation_scikit(n_fold, merged_groups, classifiers[classifier])
                all_aucs.extend(list_auc)

            final_mean = np.mean(all_aucs)
            std = np.std(all_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))

            # Store the distribution of AUCs in the dictionary
            classifier_to_aucs[classifier] = all_aucs
            classifier_to_results[classifier] = (final_mean, std)

        print(classifier_to_aucs)

        #---------------------------------------------#
        #   PLOT DISTRIBUTION OF AUC PER CLASSIFIER   #
        #---------------------------------------------#

        fig = pylab.figure(dpi=300)
        ax = pylab.axes()

        pos = 1
        xticks = []   # Define the places in which the labels will be
        xlabels = []  # Define the labels (the names of the classifiers)

        for classifier in classifiers_order:

            positions = [pos]  # Define the positions of the boxplots
            pos += 2           # Add separation between boxplots
            xlabels.append(classifier)  # Add the classifier used at the x axis

            pylab.boxplot(classifier_to_aucs[classifier], positions=positions, widths=0.6, patch_artist=True)

            # The label will be at the mean of the positions (in the middle)
            xticks.append(np.mean(positions))

        # Set axes limits and labels
        pylab.xlim(0, pos - 1)
        pylab.ylim(0, 1)
        # The ticks are fixed before giving them their labels
        ax.set_xticks(xticks)
        ax.set_xticklabels(xlabels)
        pylab.xlabel('Classifiers')
        pylab.ylabel('Distribution of AUC values')

        fig.autofmt_xdate()
        pylab.savefig(plot_name, format=fig_format)
        pylab.show()

        #---------------------------------#
        #   PRINT THE RESULTS IN A FILE   #
        #---------------------------------#

        tables_dir = os.path.join(analysis_dir, 'tables')
        create_directory(tables_dir)
        output_file = os.path.join(tables_dir, 'evaluation_classifiers{}.txt'.format(pca_str))
        ranked_results = sorted(classifier_to_results.items(), key=lambda item: item[1][0], reverse=True)
        with open(output_file, 'w') as output_fd:
            for classifier, results in ranked_results:
                output_fd.write('{}\t{}\t{}\n'.format(classifier, results[0], results[1]))

    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))

    return
def analysis_results(options): """ Analyzes the results of the comparisons """ # Start marker for time measure start = time.time() print("\n\t\t-------------------------------------------------------------------------------------------------------------------------------\n") print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Classify drug combinations\n") print("\t\t-------------------------------------------------------------------------------------------------------------------------------\n") # Get the script path main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) toolbox_dir = os.path.join(main_path, 'diana/toolbox') # Check the directory of the profiles, comparisons and analysis data_dir = os.path.join(options.workspace, "profiles") check_directory(data_dir) results_dir = os.path.join(options.workspace, "comparisons") check_directory(results_dir) analysis_dir = os.path.join(options.workspace, "analysis") check_directory(analysis_dir) # Get the list of thresholds to create the profiles if options.threshold_list and fileExist(options.threshold_list): threshold_list = get_values_from_threshold_file(options.threshold_list) else: threshold_list = [1, 5, 10, 20, 50] # Do we consider Side Effects/ATC? 
if options.consider_se: consider_se = True else: consider_se = False # Get the names of the columns columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se) #-----------------------------------------------------# # PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME # #-----------------------------------------------------# pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl') pair2comb = cPickle.load(open(pair2comb_file)) diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl') diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file)) ddi = sum(1 for x in pair2comb.values() if x == 1) non_ddi = sum(1 for x in pair2comb.values() if x == 0) print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi)) print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi)) output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv') if not fileExist(output_dataframe): # Create a data frame to store the results df = pd.DataFrame(columns=columns) # Obtain all the results subfolders of the results main folder results_dir_list = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))] for comparison in results_dir_list: drug_id1, drug_id2 = comparison.split('---') comparison_dir = os.path.join(results_dir, comparison) results_table = os.path.join(comparison_dir, 'results_table.tsv') # Add the Comb field (if it is drug combination or not) drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() comparison_without_id = '{}---{}'.format(drug1, drug2) if comparison_without_id in pair2comb: combination_field = pair2comb[comparison_without_id] else: print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id)) print(pair2comb) sys.exit(10) if not fileExist(results_table): print('The comparison {} has not been executed properly!\n'.format(comparison)) sys.exit(10) results = diana_analysis.get_results_from_table(results_table, 
columns, combination_field) df2 = pd.DataFrame([results], columns=columns, index=[comparison]) # Add the information to the main data frame df = df.append(df2) # Output the Pandas dataframe in a CSV file df.to_csv(output_dataframe) else: df = pd.read_csv(output_dataframe, index_col=0) #---------------------------# # REMOVE MISSING VALUES # #---------------------------# # Replace the None values in dcstructure by nan if 'None' in df['dcstructure']: df = df.replace(to_replace={'dcstructure':{'None':np.nan}}) # Remove the nan values in dcstructure df = df.dropna() # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc)) print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc)) #---------------------------# # IDENTIFY ME-TOO DRUGS # #---------------------------# me_too_dir = os.path.join(analysis_dir, 'me_too_drugs') create_directory(me_too_dir) me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv') me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv') me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl') me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl') if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file): df_struc = df[['dcstructure']] df_struc = df_struc.astype(float) me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table) cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w')) cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w')) else: me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file)) me_too_drug_comb_pairs = 
cPickle.load(open(me_too_drug_comb_pairs_file)) # Process me-too drug combination pairs me_too_drug_combinations = set() drug_pair_to_me_too_times = {} for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2])) drug_pair_to_me_too_times.setdefault(drug_comb1, 0) drug_pair_to_me_too_times.setdefault(drug_comb2, 0) drug_pair_to_me_too_times[drug_comb1] += 1 drug_pair_to_me_too_times[drug_comb2] += 1 removed_drug_pairs = set() for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs: continue if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]: removed_drug_pairs.add(drug_comb1) else: removed_drug_pairs.add(drug_comb2) # Remove the drug pairs which appear in me-too pairs of drug pairs more times df = df.loc[~df.index.isin(list(removed_drug_pairs))] # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc)) print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc)) img_dir = os.path.join(analysis_dir, 'figures') create_directory(img_dir) fig_format = 'png' #-----------------------------------------------------# # PLOT DISTRIBUTION OF NUMBER OF TARGETS PER DRUG # #-----------------------------------------------------# # Plot distribution of comparisons of targets drugbank2targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl') drugbank_to_targets = cPickle.load(open(drugbank2targets_file)) plot_distribution_targets = os.path.join(img_dir, 'distribution_number_targets.{}'.format(fig_format)) targets = [len(x) for x in drugbank_to_targets.values()] n, bins, patches = 
plt.hist(np.array(targets), bins=50, weights=np.zeros_like(np.array(targets)) + 1. / np.array(targets).size, facecolor='r') plt.xlabel('Number of targets per drug') plt.ylabel('Relative frequency') plt.title('Distribution of the number of targets per drug') plt.savefig(plot_distribution_targets, format=fig_format, dpi=300) plt.clf() #----------------------------------------------------------------------------------------------# # EVALUATE OVERLAP BETWEEN TARGETS, BIOLOGICAL PROCESSES AND PATHWAYS IN DRUG COMBINATIONS # #----------------------------------------------------------------------------------------------# tables_dir = os.path.join(analysis_dir, 'tables') create_directory(tables_dir) if options.formula != 'jaccard' and options.formula != 'simpson': print('Please, introduce a correct formula to classify drug combinations: jaccard or simpson!\n') sys.exit(10) # Plot of distribution of comparisons of Targets plot_ji_targets = os.path.join(img_dir, 'distribution_{}_index_targets.{}'.format(options.formula, fig_format)) # Plot of distribution of comparisons of Biological Processes plot_ji_bp = os.path.join(img_dir, 'distribution_{}_index_biological_processes.{}'.format(options.formula, fig_format)) # Plot of distribution of comparisons of Pathways plot_ji_pathways = os.path.join(img_dir, 'distribution_{}_index_pathways.{}'.format(options.formula, fig_format)) # Output pickle file of the classification classification_targets_bp_file = os.path.join(toolbox_dir, 'classification_targets_bp.pcl') classification_targets_pathways_file = os.path.join(toolbox_dir, 'classification_targets_pathways.pcl') # Get the classification files drug_int_2_drugs_file = os.path.join(toolbox_dir, 'drug_int_2_drugs.pcl') drug_int_2_drugs = cPickle.load(open(drug_int_2_drugs_file)) drug_int_2_info_file = os.path.join(toolbox_dir, 'drug_int_2_info.pcl') drug_int_2_info = cPickle.load(open(drug_int_2_info_file)) drugbank_to_dcdb_file = os.path.join(toolbox_dir, 'drugbank_to_dcdb.pcl') 
drugbank_to_dcdb = cPickle.load(open(drugbank_to_dcdb_file)) bio_processes_file = os.path.join(toolbox_dir, 'target_to_bio_processes.pcl') target_to_bio_processes = cPickle.load(open(bio_processes_file)) pathways_file = os.path.join(toolbox_dir, 'target_to_pathways.pcl') target_to_pathways = cPickle.load(open(pathways_file)) target_comparisons = [] bp_comparisons = [] pathway_comparisons = [] dc_to_target_ji = {} dc_to_bp_ji = {} dc_to_pathway_ji = {} all_drugs = set() for index, row in dc_data.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() all_drugs.add(drug1) all_drugs.add(drug2) if drug1 in drugbank_to_targets and drug2 in drugbank_to_targets: targets1 = drugbank_to_targets[drug1] targets2 = drugbank_to_targets[drug2] if options.formula == 'jaccard': result_targets = diana_comparison.calculate_jaccard_index(targets1, targets2) elif options.formula == 'simpson': result_targets = diana_comparison.calculate_simpson_index(targets1, targets2) target_comparisons.append(result_targets) dc_to_target_ji[index] = result_targets bio_proc1 = get_results_from_dict_of_sets(targets1, target_to_bio_processes) bio_proc2 = get_results_from_dict_of_sets(targets2, target_to_bio_processes) if options.formula == 'jaccard': result_bp = diana_comparison.calculate_jaccard_index(bio_proc1, bio_proc2) elif options.formula == 'simpson': result_bp = diana_comparison.calculate_simpson_index(bio_proc1, bio_proc2) bp_comparisons.append(result_bp) dc_to_bp_ji[index] = result_bp pathways1 = get_results_from_dict_of_sets(targets1, target_to_pathways) pathways2 = get_results_from_dict_of_sets(targets2, target_to_pathways) if options.formula == 'jaccard': result_pathways = diana_comparison.calculate_jaccard_index(pathways1, pathways2) elif options.formula == 'simpson': result_pathways = diana_comparison.calculate_simpson_index(pathways1, pathways2) pathway_comparisons.append(result_pathways) 
dc_to_pathway_ji[index] = result_pathways # Plot distribution of comparisons of targets n, bins, patches = plt.hist(np.array(target_comparisons), bins=50, weights=np.zeros_like(np.array(target_comparisons)) + 1. / np.array(target_comparisons).size, facecolor='r') plt.xlabel('{} Index of Targets'.format(options.formula.capitalize())) plt.ylabel('Relative frequency') plt.title('Distribution of {} Index of Targets in drug combinations'.format(options.formula.capitalize())) plt.savefig(plot_ji_targets, format=fig_format, dpi=300) plt.clf() # Plot distribution of comparisons of biological processes n, bins, patches = plt.hist(np.array(bp_comparisons), bins=50, weights=np.zeros_like(np.array(bp_comparisons)) + 1. / np.array(bp_comparisons).size, facecolor='b') plt.xlabel('{} Index of Biological Processes'.format(options.formula.capitalize())) plt.ylabel('Relative frequency') plt.title('Distribution of {} Index of Biological Processes in drug combinations'.format(options.formula.capitalize())) plt.savefig(plot_ji_bp, format=fig_format, dpi=300) plt.clf() # Plot distribution of comparisons of pathways n, bins, patches = plt.hist(np.array(pathway_comparisons), bins=50, weights=np.zeros_like(np.array(pathway_comparisons)) + 1. 
/ np.array(pathway_comparisons).size, facecolor='g') plt.xlabel('{} Index of Pathways'.format(options.formula.capitalize())) plt.ylabel('Relative frequency') plt.title('Distribution of {} Index of Pathways in drug combinations'.format(options.formula.capitalize())) plt.savefig(plot_ji_pathways, format=fig_format, dpi=300) plt.clf() #------------------------------------# # CLASSIFY THE DRUG COMBINATIONS # #------------------------------------# # Similar targets --> ji > 0.25 # Different targets --> ji <= 0.25 target_cut_off = 0.5 # Similar biological processes --> ji >= 0.25 # Different biological processes --> ji < 0.25 bp_cut_off = 0.5 # Similar pathways --> ji >= 0.5 # Different pathways --> ji < 0.5 pathway_cut_off = 0.5 classification_tar_bp = {} st = 0 dt = 0 st_sbp = 0 st_dbp = 0 dt_sbp = 0 dt_dbp = 0 for dc in dc_to_target_ji: # Classify by targets and biological processes if dc in dc_to_bp_ji: ji_tar = dc_to_target_ji[dc] ji_bp = dc_to_bp_ji[dc] if ji_tar > target_cut_off: classification_tar_bp[dc] = 'similar_targets' st += 1 if ji_bp > bp_cut_off: st_sbp += 1 elif ji_bp <= bp_cut_off: st_dbp += 1 elif ji_tar <= target_cut_off: dt += 1 if ji_bp > bp_cut_off: dt_sbp += 1 classification_tar_bp[dc] = 'different_targets_similar_bp' elif ji_bp <= bp_cut_off: dt_dbp += 1 classification_tar_bp[dc] = 'different_targets_different_bp' print('Similar targets {}: similar bp {}, diff bp {}\n'.format(st, st_sbp, st_dbp)) print('Different targets {}: similar bp {}, diff bp {}\n'.format(dt, dt_sbp, dt_dbp)) cPickle.dump(classification_tar_bp, open(classification_targets_bp_file, 'w')) classification_tar_pathway = {} st = 0 dt = 0 st_spath = 0 st_dpath = 0 dt_spath = 0 dt_dpath = 0 for dc in dc_to_target_ji: # Classify by targets and biological processes if dc in dc_to_pathway_ji: ji_tar = dc_to_target_ji[dc] ji_path = dc_to_pathway_ji[dc] if ji_tar > target_cut_off: classification_tar_pathway[dc] = 'similar_targets' st += 1 if ji_path > pathway_cut_off: st_spath += 1 elif 
ji_path <= pathway_cut_off: st_dpath += 1 elif ji_tar <= target_cut_off: dt += 1 if ji_path > pathway_cut_off: dt_spath += 1 classification_tar_pathway[dc] = 'different_targets_similar_pathways' elif ji_path <= pathway_cut_off: dt_dpath += 1 classification_tar_pathway[dc] = 'different_targets_different_pathways' print('Similar targets {}: similar pathways {}, diff pathways {}\n'.format(st, st_spath, st_dpath)) print('Different targets {}: similar pathways {}, diff pathways {}\n'.format(dt, dt_spath, dt_dpath)) cPickle.dump(classification_tar_pathway, open(classification_targets_pathways_file, 'w')) # Get number of drugs in drug combinations per number of targets targets = [len(drugbank_to_targets[drug]) for drug in drugbank_to_targets if drug in all_drugs] numtargets_to_numdrugs = {} for target in targets: numtargets_to_numdrugs.setdefault(target, 0) numtargets_to_numdrugs[target] += 1 print('Number of drugs in drug combination: {}. Divided by four: {}'.format(len(all_drugs), len(all_drugs)/4)) for numtar, numdrug in sorted(numtargets_to_numdrugs.iteritems(), key=lambda (x, y): x, reverse = True): print(numtar, numdrug) # End marker for time end = time.time() print('\n DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60)) return
def _load_pickle(path):
    """Load a pickled object from *path*, closing the file handle afterwards."""
    # NOTE(review): pickle must only be used on trusted files; the paths read
    # here are toolbox/analysis files produced by this project.
    with open(path) as pickle_fd:
        return cPickle.load(pickle_fd)


def _dump_pickle(obj, path):
    """Pickle *obj* into *path*, closing (and flushing) the file afterwards."""
    with open(path, 'w') as pickle_fd:
        cPickle.dump(obj, pickle_fd)


def _matches_target_range(count, range_tar, num_targets):
    """Tell whether a number of targets (*count*) falls in the range *range_tar*.

    A multi-valued range matches by membership. A single-valued range means
    "<= value" when it is the first range of *num_targets*, ">= value" when it
    is the last one, and "== value" otherwise.
    """
    if len(range_tar) != 1:
        return count in range_tar
    limit = range_tar[0]
    if range_tar == num_targets[0]:
        return count <= limit
    if range_tar == num_targets[-1]:
        return count >= limit
    return count == limit


def analysis_results(options):
    """
    Analyzes the results of the comparisons, grouping the drug pairs by the
    number of targets of their drugs.

    Steps:
      1. Build (or read, if it already exists) the CSV table with the results
         of all the comparisons found in <workspace>/comparisons.
      2. Remove rows with missing values and conflictive me-too drug pairs.
      3. For every range of number of targets and every method, run repeated
         n-fold cross-validations and store the distribution of AUCs.
      4. Write the precision/sensitivity plots, the AUC distribution plot and
         the AUC, precision/sensitivity and Mann-Whitney U tables.

    Parameters:
      options -- object with the attributes read here: workspace,
                 threshold_list, consider_se, different_atc and pca.

    Returns None. The process is terminated with sys.exit(10) if a comparison
    is not in the pair2comb dictionary or has no results table.
    """

    # Start marker for time measure
    start = time.time()

    print("\n\t\t------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n")
    print("\t\t------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    consider_se = bool(options.consider_se)

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = _load_pickle(pair2comb_file)

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [f for f in os.listdir(results_dir)
                            if os.path.isdir(os.path.join(results_dir, f))]

        # One single-row data frame per comparison; they are concatenated once
        # at the end instead of growing the main data frame row by row.
        comparison_frames = []

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns, combination_field)

            comparison_frames.append(pd.DataFrame([results], columns=columns, index=[comparison]))

        # Add the information to the main data frame
        if comparison_frames:
            df = pd.concat(comparison_frames)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan.
    # The check is done on the VALUES of the column: "'None' in series" would
    # only look at the index labels and never find them.
    if df['dcstructure'].isin(['None']).any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        _dump_pickle(me_too_drug_pairs, me_too_drug_pairs_file)
        _dump_pickle(me_too_drug_comb_pairs, me_too_drug_comb_pairs_file)

    else:

        # Only the pairs of drug combinations are needed from here on
        me_too_drug_comb_pairs = _load_pickle(me_too_drug_comb_pairs_file)

    # Process me-too drug combination pairs: count in how many me-too pairs
    # every drug combination takes part
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1

    # From every me-too pair that is still intact, remove the member that
    # appears in more me-too pairs (the second one in case of a tie)
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))

    #-------------------------------------#
    #   EVALUATE PERFORMANCE BY TARGETS   #
    #-------------------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Number of targets (ranges used to group the drug pairs)
    num_targets = [[1], [2], [3, 4, 5, 6], [7]]

    # Names of the methods (types_analysis2 is types_analysis without random)
    if consider_se:
        if options.different_atc:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcse']
            types_analysis_labels = ['Target', 'PPI', 'Structure', 'Side Effects', 'Random']
        else:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse']
            types_analysis_labels = ['Target', 'PPI', 'Structure', 'ATC', 'Side Effects', 'Random']
    else:
        types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random']
        types_analysis2 = ['dctargets', 'dcguild', 'dcstructure']
        types_analysis_labels = ['Target', 'PPI', 'Structure', 'Random']

    # Machine learning parameters
    repetitions = 25  # Number of repetitions
    n_fold = 2        # Number of folds
    classifier = 'SVC best 1'
    classifiers = {
        'KNeighbors': KNeighborsClassifier(3),
        'SVC': SVC(probability=True),
        'SVC linear': SVC(kernel="linear", C=0.025),
        'SVC rbf': SVC(gamma=2, C=1),
        'DecisionTree': DecisionTreeClassifier(max_depth=5),
        'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP': MLPClassifier(alpha=1),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis(),
        'SVC best 1': SVC(kernel="rbf", gamma=0.01, C=100, probability=True),
        'SVC best 2': SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True),
    }

    pca_str = '_withPCA' if options.pca else '_withoutPCA'

    # Plot of distributions of AUC
    plot_auc_distribution = os.path.join(img_dir, 'numtargets_auc_distribution_ranges{}.{}'.format(pca_str, fig_format))

    # Plot of accuracy/sensitivity name
    acc_sens_dctargets = os.path.join(img_dir, 'numtargets_accsens_dctargets_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcguild = os.path.join(img_dir, 'numtargets_accsens_dcguild_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcstructure = os.path.join(img_dir, 'numtargets_accsens_dcstructure_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcatc = os.path.join(img_dir, 'numtargets_accsens_dcatc_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcse = os.path.join(img_dir, 'numtargets_accsens_dcse_ranges{}.{}'.format(pca_str, fig_format))

    # Results table
    results_table = os.path.join(tables_dir, 'numtargets_auc_table_ranges{}.txt'.format(pca_str))

    # Accuracy/Sensitivity results table
    prec_rec_table = os.path.join(tables_dir, 'numtargets_accsens_table_ranges{}.txt'.format(pca_str))

    # File with results of Mann Whitney tests
    mannwhitney_file = os.path.join(tables_dir, 'numtargets_mannwhitney_ranges{}.txt'.format(pca_str))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
    drugbank_to_targets = _load_pickle(drugbank_to_targets_file)

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = _load_pickle(diana_id_to_drugbank_file)

    # Dictionary that will store the results:
    # auc_results[first value of the range][method] -> dictionary of results
    auc_results = {}

    # Columns of every method and list of [method, columns] to evaluate
    if consider_se:
        dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)
        # NOTE(review): 'dcatc' is evaluated even when options.different_atc is
        # set, although it is then left out of types_analysis -- confirm that
        # this is intended.
        list_methods = [
            ['dctargets', dct_columns],
            ['dcguild', dcg_columns],
            ['dcstructure', dcs_columns],
            ['dcatc', dcatc_columns],
            ['dcse', dcse_columns],
            ['random', columns],
        ]
    else:
        dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)
        list_methods = [
            ['dctargets', dct_columns],
            ['dcguild', dcg_columns],
            ['dcstructure', dcs_columns],
            ['random', columns],
        ]

    for range_tar in num_targets:

        # Select the drug pairs in which BOTH drugs have a number of targets
        # inside the current range
        selected_rows = []
        for index in df.index:
            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()
            if (_matches_target_range(len(drugbank_to_targets[drug1]), range_tar, num_targets)
                    and _matches_target_range(len(drugbank_to_targets[drug2]), range_tar, num_targets)):
                selected_rows.append(index)

        # Label-based selection (the index contains the names of the comparisons)
        df_tar = df.loc[selected_rows]
        dc_data = df_tar[df_tar['combination'] == 1]
        num_dc = len(dc_data.index)
        print('Num drug combinations: {}'.format(num_dc))

        for method, columns_method in list_methods:

            print('Evaluating {} targets with method {}\n'.format(range_tar, method))

            #------------------------------------------------------------------#
            #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
            #------------------------------------------------------------------#

            if options.pca:

                variance_cut_off = 0.01
                df_method = df_tar[columns_method]
                df_raw = df_method.drop('combination', axis=1)
                raw_columns = copy.copy(columns_method)
                raw_columns.remove('combination')

                pca = PCA(n_components=None)
                pca.fit(df_raw)
                values_trans = pca.transform(df_raw)
                explained_variance = pca.explained_variance_ratio_

                # Keep as many components as ratios of explained variance are
                # above the cut-off
                num_components = sum(
                    1 for _, var in zip(raw_columns, explained_variance)
                    if var > variance_cut_off)

                if num_components < len(raw_columns):

                    print('Number of features:\t{}\n'.format(len(raw_columns)))
                    print('Reduction to {} components\n'.format(num_components))

                    pca = PCA(n_components=num_components)
                    pca.fit(df_raw)
                    values_trans = pca.transform(df_raw)

                # NOTE(review): the transformed values are used whether or not
                # the number of components has been reduced -- confirm.
                indexes = df_method.index.values
                df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
                df_comb = df_method[['combination']]
                df_method = pd.concat([df_trans, df_comb], axis=1)

            else:

                # Manually introduced features
                guild_thresholds = [1, 5]
                rank_scoring = ['spearman', 'dot_product']
                list_scoring = ['jaccard']
                if method in ('Combination', 'random'):
                    selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)
                else:
                    selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(method, guild_thresholds, rank_scoring, list_scoring)

                # Remove ATC columns if different ATC
                if options.different_atc and consider_se:
                    selected_columns = [col for col in selected_columns
                                        if col not in dcatc_columns or col == 'combination']

                print('Selected columns: {}\n'.format(', '.join(selected_columns)))
                # We take away the combinations column
                print('Number of selected features: {}\n'.format(len(selected_columns)-1))

                # Define the new table with the selected columns
                df_method = df_tar[selected_columns]

            #------------------------------------------------------------------#

            dc_data = df_method[df_method['combination'] == 1]
            ndc_data = df_method[df_method['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

            print('Building {} repetition groups of {} (same) DC and {} (different) non-DC'.format(repetitions, num_dc, num_dc))
            # Obtain n number of groups containing different non-drug
            # combinations to repeat the analysis n times
            ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc)

            all_aucs = []   # Here we will store ALL the AUCs
            all_probs = []  # Here we store all the probabilities and labels

            # Number of items in each group of the cross-validation
            num_items_group = int(float(num_dc) / float(n_fold))

            for num_repetitions, ndc_data_equal in enumerate(ndc_repetitions, 1):

                if num_repetitions == 1:
                    print('Building {} fold groups of {} DC and {} non-DC x {} repetitions'.format(n_fold, num_items_group, num_items_group, repetitions))

                # Defining the drug combination / non-drug combination groups
                # in each cross-validation step
                dc_groups = diana_analysis.obtain_n_groups_of_k_length(dc_data, n_fold, num_items_group, me_too_drug_combinations)
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations)
                merged_groups = [pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)]

                # Only the list of AUCs and of probabilities are kept from the
                # cross-validation (the final mean/std come from all the AUCs)
                if method == 'random':
                    _, _, _, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(n_fold, merged_groups, classifiers[classifier])
                else:
                    _, _, _, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(n_fold, merged_groups, classifiers[classifier])

                all_aucs = all_aucs + list_auc
                all_probs = all_probs + list_prob

            final_mean = np.mean(all_aucs)
            std = np.std(all_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))

            # Store the distribution of AUCs in the dictionary
            method_results = auc_results.setdefault(range_tar[0], {}).setdefault(method, {})
            method_results['all_aucs'] = all_aucs
            method_results['all_probs'] = all_probs
            method_results['mean'] = final_mean
            method_results['std'] = std
            method_results['num_dc'] = num_dc

    #------------------------------------#
    #   PLOT PRECISION VS. SENSITIVITY   #
    #------------------------------------#

    # The returned dictionary is expected to carry the 'cut_off' and 'value'
    # entries of every method, which are read below for the table
    auc_results = plot_precision_sensitivity(auc_results, 'dctargets', num_targets, acc_sens_dctargets)
    auc_results = plot_precision_sensitivity(auc_results, 'dcguild', num_targets, acc_sens_dcguild)
    auc_results = plot_precision_sensitivity(auc_results, 'dcstructure', num_targets, acc_sens_dcstructure)
    if consider_se:
        auc_results = plot_precision_sensitivity(auc_results, 'dcatc', num_targets, acc_sens_dcatc)
        auc_results = plot_precision_sensitivity(auc_results, 'dcse', num_targets, acc_sens_dcse)

    #----------------------------------------------------#
    #   PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #----------------------------------------------------#

    plot_auc_distributions(auc_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se)

    #--------------------------------------------------------#
    #   TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #--------------------------------------------------------#

    with open(results_table, 'w') as results_table_fd:

        # Header
        results_table_fd.write(' ')
        for method in types_analysis_labels:
            results_table_fd.write('\t{}\t \t '.format(method))
        results_table_fd.write('\n')

        for num in num_targets:
            results_table_fd.write('{}'.format(num))
            for method in types_analysis:
                method_results = auc_results[num[0]][method]
                results_table_fd.write('\t{}\t{}\t{}'.format(
                    method_results['mean'], method_results['std'], method_results['num_dc']))
            results_table_fd.write('\n')

    #----------------------------------------#
    #   TABLE OF PRECISION VS. SENSITIVITY   #
    #----------------------------------------#

    with open(prec_rec_table, 'w') as prec_rec_table_fd:

        # Header
        prec_rec_table_fd.write(' ')
        for method in types_analysis2:
            prec_rec_table_fd.write('\t{}\t '.format(method))
        prec_rec_table_fd.write('\n')

        for num in num_targets:
            prec_rec_table_fd.write('{}'.format(num))
            for method in types_analysis2:
                cut_off = auc_results[num[0]][method]['cut_off']
                value = auc_results[num[0]][method]['value']
                prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value))
            prec_rec_table_fd.write('\n')

    #-------------------------------------------------------------------#
    #   TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U   #
    #-------------------------------------------------------------------#

    with open(mannwhitney_file, 'w') as mannwhitney_fd:

        mann_results = {}

        mannwhitney_fd.write(' \t ')
        for method in types_analysis_labels:
            mannwhitney_fd.write('\t{}'.format(method))
        mannwhitney_fd.write('\n')

        # Perform the comparisons
        for num in num_targets:
            mann_results.setdefault(num[0], {})
            for method1 in types_analysis:
                mann_results[num[0]].setdefault(method1, {})
                for method2 in types_analysis:
                    if method1 == method2:
                        mann_results[num[0]][method1][method2] = '-'
                    else:
                        method1_dist = auc_results[num[0]][method1]['all_aucs']
                        method2_dist = auc_results[num[0]][method2]['all_aucs']
                        stat, pval = scipy.stats.mannwhitneyu(method1_dist, method2_dist)
                        mann_results[num[0]][method1][method2] = [stat, pval]

        # Write the table of crossings
        for num in num_targets:
            for method1 in types_analysis:
                mannwhitney_fd.write('{}\t{}'.format(num[0], method1))
                for method2 in types_analysis:
                    if method1 == method2:
                        mannwhitney_fd.write('\t-')
                    else:
                        stat, pval = mann_results[num[0]][method1][method2]
                        mannwhitney_fd.write('\t{}, {:.2e}'.format(stat, pval))
                mannwhitney_fd.write('\n')

    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))

    return
def analysis_results(options): """ Analyzes the results of the comparisons """ # Start marker for time measure start = time.time() print( "\n\t\t------------------------------------------------------------------------------------------------------------------------\n" ) print( "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n" ) print( "\t\t------------------------------------------------------------------------------------------------------------------------\n" ) # Get the script path main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) toolbox_dir = os.path.join(main_path, 'diana/toolbox') # Check the directory of the profiles, comparisons and analysis data_dir = os.path.join(options.workspace, "profiles") check_directory(data_dir) results_dir = os.path.join(options.workspace, "comparisons") check_directory(results_dir) analysis_dir = os.path.join(options.workspace, "analysis") check_directory(analysis_dir) # Create a directory for the analysis of the comparison with other methods if options.comparison_other_methods: analysis_dir = os.path.join(options.workspace, "analysis_comparison") create_directory(analysis_dir) # Get the list of thresholds to create the profiles if options.threshold_list and fileExist(options.threshold_list): threshold_list = get_values_from_threshold_file(options.threshold_list) else: threshold_list = [1, 5, 10, 20, 50] # Do we consider Side Effects/ATC? 
if options.consider_se: consider_se = True else: consider_se = False # Make a cross-validation with the validation set (True) # or make a training with the training and a validation with the validation (False) cross_validation = False # Get the names of the columns columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se) #-----------------------------------------------------# # PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME # #-----------------------------------------------------# pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl') pair2comb = cPickle.load(open(pair2comb_file)) ddi = sum(1 for x in pair2comb.values() if x == 1) non_ddi = sum(1 for x in pair2comb.values() if x == 0) print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi)) print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi)) output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv') # Change the name of the output file if we are doing a comparison with other methods if options.comparison_other_methods: output_dataframe = os.path.join(analysis_dir, 'comparison_other_methods.csv') if not fileExist(output_dataframe): # Create a data frame to store the results df = pd.DataFrame(columns=columns) # Obtain all the results subfolders of the results main folder results_dir_list = [ f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f)) ] for comparison in results_dir_list: drug_id1, drug_id2 = comparison.split('---') comparison_dir = os.path.join(results_dir, comparison) results_table = os.path.join(comparison_dir, 'results_table.tsv') # Add the Comb field (if it is drug combination or not) drug1 = drug_id1.split('_')[0].upper() drug2 = drug_id2.split('_')[0].upper() comparison_without_id = '{}---{}'.format(drug1, drug2) if comparison_without_id in pair2comb: combination_field = pair2comb[comparison_without_id] else: print( 'The comparison {} is not in the pair2comb dictionary!\n'. 
format(comparison_without_id)) print(pair2comb) sys.exit(10) if not fileExist(results_table): print('The comparison {} has not been executed properly!\n'. format(comparison)) sys.exit(10) results = get_results_from_table(results_table, columns, combination_field) df2 = pd.DataFrame([results], columns=columns, index=[comparison]) # Add the information to the main data frame df = df.append(df2) # Output the Pandas dataframe in a CSV file df.to_csv(output_dataframe) else: df = pd.read_csv(output_dataframe, index_col=0) #---------------------------# # REMOVE MISSING VALUES # #---------------------------# # Replace the None values in dcstructure by nan if 'None' in df['dcstructure']: df = df.replace(to_replace={'dcstructure': {'None': np.nan}}) # Remove the nan values in dcstructure df = df.dropna() # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing missing values:\t{}\n'. format(num_dc)) print( 'Number of non-drug combinations after removing missing values:\t{}\n'. 
format(num_ndc)) #---------------------------# # IDENTIFY ME-TOO DRUGS # #---------------------------# me_too_dir = os.path.join(analysis_dir, 'me_too_drugs') create_directory(me_too_dir) me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv') me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv') me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl') me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl') if not fileExist(me_too_drug_pairs_file) or not fileExist( me_too_drug_comb_pairs_file): df_struc = df[['dcstructure']] df_struc = df_struc.astype(float) me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations( df_struc, columns, me_too_drugs_table, me_too_drug_combs_table) cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w')) cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w')) else: me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file)) me_too_drug_comb_pairs = cPickle.load( open(me_too_drug_comb_pairs_file)) # Process me-too drug combination pairs me_too_drug_combinations = set() drug_pair_to_me_too_times = {} for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2])) drug_pair_to_me_too_times.setdefault(drug_comb1, 0) drug_pair_to_me_too_times.setdefault(drug_comb2, 0) drug_pair_to_me_too_times[drug_comb1] += 1 drug_pair_to_me_too_times[drug_comb2] += 1 removed_drug_pairs = set() for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs: continue if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[ drug_comb2]: removed_drug_pairs.add(drug_comb1) else: removed_drug_pairs.add(drug_comb2) # Remove the drug pairs which appear in me-too pairs of drug pairs more times df = 
df.loc[~df.index.isin(list(removed_drug_pairs))] # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n' .format(num_dc)) print( 'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n' .format(num_ndc)) #----------------------------# # GET THE VALIDATION SET # #----------------------------# training_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_training.csv') validation_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_validation.csv') df_training = pd.read_csv(training_dataframe, index_col=0) df_validation = pd.read_csv(validation_dataframe, index_col=0) if cross_validation: df = df_validation dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Number of drug combinations after getting the validation dataset:\t{}\n' .format(num_dc)) print( 'Number of non-drug combinations after getting the validation dataset:\t{}\n' .format(num_ndc)) else: # Define the variables for the training dataset df = df_training dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Number of drug combinations after getting the training dataset:\t{}\n' .format(num_dc)) print( 'Number of non-drug combinations after getting the training dataset:\t{}\n' .format(num_ndc)) # Define the variables for the validation dataset dc_data_val = df_validation[df_validation['combination'] == 1] ndc_data_val = df_validation[df_validation['combination'] == 0] num_dc_val = len(dc_data_val.index) num_ndc_val = len(ndc_data_val.index) print( 'Number of drug combinations after getting the validation dataset:\t{}\n' .format(num_dc_val)) print( 'Number of non-drug 
combinations after getting the validation dataset:\t{}\n' .format(num_ndc_val)) #-------------------------# # EVALUATE PERFORMANCE # #-------------------------# img_dir = os.path.join(analysis_dir, 'figures') create_directory(img_dir) fig_format = 'png' tables_dir = os.path.join(analysis_dir, 'tables') create_directory(tables_dir) # Machine learning parameters repetitions = 25 # Number of repetititons n_fold = 10 # Number of folds min_num_dc_group = 10 classifier = 'SVC best 1' classifiers = { 'KNeighbors': KNeighborsClassifier(3), 'SVC': SVC(probability=True), 'SVC linear': SVC(kernel="linear", C=0.025), 'SVC rbf': SVC(gamma=2, C=1), 'DecisionTree': DecisionTreeClassifier(max_depth=5), 'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 'MLP': MLPClassifier(alpha=1), 'AdaBoost': AdaBoostClassifier(), 'GaussianNB': GaussianNB(), 'QuadraticDiscr.': QuadraticDiscriminantAnalysis(), 'SVC best 1': SVC(kernel="rbf", gamma=0.01, C=100, probability=True), 'SVC best 2': SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True) } if options.pca: pca_str = '_withPCA' else: pca_str = '_withoutPCA' if options.different_atc: atc_str = '_diff_ATC' else: atc_str = '' # Plot of distributions of AUC plot_name = os.path.join( img_dir, 'general_performance_by_methods{}{}.{}'.format(atc_str, pca_str, fig_format)) # Get the targets file drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl') drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file)) # Get the ATC file drugbank_to_atcs_file = os.path.join(toolbox_dir, 'drugbank_to_atcs.pcl') drugbank_to_atcs = cPickle.load(open(drugbank_to_atcs_file)) # Get the DIANA IDs file diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl') diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file)) print('\nEVALUATION OF GENERAL PERFORMANCE\n') repetitions = 25 n_fold = 10 analysis_results = {} method_to_results = {} method_to_probs = {} # Get 
columns for each method if consider_se: dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns( threshold_list, ATC_SE=consider_se) else: dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns( threshold_list, ATC_SE=consider_se) # Remove ATC columns if different ATC if options.different_atc: columns = [ col for col in columns if col not in dcatc_columns or col == 'combination' ] if consider_se: if options.different_atc: list_methods = [['Combination', columns], ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns]] methods_ordered = [ 'Combination', 'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random' ] method_to_label = { 'Combination': 'All', 'dctargets': 'Target', 'dcguild': 'PPI', 'dcstructure': 'Structure', 'dcatc': 'ATC', 'dcse': 'Side Effects', 'random': 'Random' } colors_ordered = [['yellow', 'black'], ['#ff7373', 'red'], ['#32f232', 'green'], ['#4f4f4f', 'black'], ['#e59600', '#966200'], ['#aeaeae', 'black'] ] # yellow, red, green, black, orange, grey else: list_methods = [['Combination', columns], ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns]] methods_ordered = [ 'Combination', 'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random' ] method_to_label = { 'Combination': 'All', 'dctargets': 'Target', 'dcguild': 'PPI', 'dcstructure': 'Structure', 'dcatc': 'ATC', 'dcse': 'Side Effects', 'random': 'Random' } colors_ordered = [ ['yellow', 'black'], ['#ff7373', 'red'], ['#32f232', 'green'], ['#4f4f4f', 'black'], ['#22a9bd', '#0049e5'], ['#e59600', '#966200'], ['#aeaeae', 'black'] ] # yellow, red, green, black, blue, orange, grey else: list_methods = [['Combination', columns], ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], 
['random', columns]] methods_ordered = [ 'Combination', 'dctargets', 'dcguild', 'dcstructure', 'random' ] method_to_label = { 'Combination': 'All', 'dctargets': 'Target', 'dcguild': 'PPI', 'dcstructure': 'Structure', 'random': 'Random' } colors_ordered = [['white', 'black'], ['#ff7373', 'red'], ['#32f232', 'green'], ['#4f4f4f', 'black'], ['#aeaeae', 'black']] # white, red, green, black, grey #-------------------------------------------------# # SELECT DRUG COMBINATIONS WITH DIFFERENT ATC # #-------------------------------------------------# if options.different_atc: selected_rows = [] for index, row in df.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]]) atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]]) intersection = atcs_drug1 & atcs_drug2 if len(intersection) == 0: selected_rows.append(index) df = df.ix[selected_rows] dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Num drug combinations after removing the ones with same ATC in training: {}' .format(num_dc)) print( 'Num non-drug combinations after removing the ones with same ATC in training: {}' .format(num_ndc)) selected_rows = [] for index, row in df_validation.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]]) atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]]) intersection = atcs_drug1 & atcs_drug2 if len(intersection) == 0: selected_rows.append(index) df_validation = df_validation.ix[selected_rows] dc_data_val = df_validation[df_validation['combination'] == 1] ndc_data_val = df_validation[df_validation['combination'] == 0] num_dc_val = len(dc_data_val.index) num_ndc_val = 
len(ndc_data_val.index) print( 'Num drug combinations (in validation) after removing the ones with same ATC in training: {}' .format(num_dc_val)) print( 'Num non-drug combinations (in validation) after removing the ones with same ATC in training: {}' .format(num_ndc_val)) #--------------------------# # EVALUATE EACH METHOD # #--------------------------# for method, columns_method in list_methods: print('Evaluating method {}\n'.format(method)) #------------------------------------------------------------------# # SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA # #------------------------------------------------------------------# if options.pca: # Strategy: # We calculate the explained variance ratio for all the features. # We define a a cut-off threshold of the minimum variance ratio that we consider relevant. # We will count the number of features with explained variance higher than the cut-off defined. # Then, we will reduce the dimensionality to the number of features with variance higher than the cut-off. 
variance_cut_off = 0.01 num_components = 0 scoring_methods = ['spearman', 'dot_product', 'jaccard'] df_method = df[columns_method] df_val = df_validation[columns_method] df_all = pd.concat([df_method, df_val]) df_raw = df_all.drop('combination', axis=1) raw_columns = copy.copy(columns_method) raw_columns.remove('combination') pca = PCA(n_components=None) pca.fit(df_raw) values_trans = pca.transform(df_raw) explained_variance = pca.explained_variance_ratio_ for var in explained_variance: if var > variance_cut_off: num_components += 1 if num_components < len(raw_columns): print('Number of features:\t{}\n'.format(len(raw_columns))) print('Reduction to {} components\n'.format(num_components)) pca = PCA(n_components=num_components) pca.fit(df_raw) values_trans = pca.transform(df_raw) indexes = df_all.index.values df_trans = pd.DataFrame.from_records(values_trans, index=indexes) df_comb = df_all[['combination']] df_pca = pd.concat([df_trans, df_comb], axis=1) train_indexes = df_method.index.values val_indexes = df_val.index.values df_method = df_pca.loc[train_indexes] dc_data = df_method[df_method['combination'] == 1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) df_val = df_pca.loc[val_indexes] dc_data_val = df_val[df_val['combination'] == 1] ndc_data_val = df_val[df_val['combination'] == 0] num_dc_val = len(dc_data_val.index) num_ndc_val = len(ndc_data_val.index) else: # Manually introduced features guild_thresholds = [1, 5] rank_scoring = ['spearman', 'dot_product'] list_scoring = ['jaccard'] if method == 'Combination' or method == 'random': selected_columns = diana_analysis.obtain_columns_best_features( guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se) else: selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method( method, guild_thresholds, rank_scoring, list_scoring) # Remove ATC columns if different ATC if options.different_atc and consider_se: if method != 
'dcatc': selected_columns = [ col for col in selected_columns if col not in dcatc_columns or col == 'combination' ] print('Selected columns: {}\n'.format(', '.join(selected_columns))) print('Number of selected features: {}\n'.format( len(selected_columns) - 1)) # We take away the combinations column # Define the new table with the selected columns df_method = df[selected_columns] dc_data = df_method[df_method['combination'] == 1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) # Define also the validation table with new columns df_val = df_validation[selected_columns] dc_data_val = df_val[df_val['combination'] == 1] ndc_data_val = df_val[df_val['combination'] == 0] num_dc_val = len(dc_data_val.index) num_ndc_val = len(ndc_data_val.index) #-------------------------# # CLASSIFY DRUG PAIRS # #-------------------------# if options.without_repetition: # from sklearn.model_selection import train_test_split # data, target = df_method.iloc[:, :-1], df_method.iloc[:, -1] # X_train, X_test, y_train, y_test = train_test_split( # data, target, test_size=0.1, random_state=0) # clf = classifiers[classifier].fit(X_train, y_train) # y_pred = clf.predict(X_test) # fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred) # auc = metrics.roc_auc_score(y_test, y_pred) # analysis_results[method] = auc # print('Method: {}. 
AUC {}.'.format(method, auc)) # print(fpr) # print(tpr) # Calculate the number of items per group num_items_group_dc = int(float(num_dc) / float(n_fold)) num_items_group_ndc = int(float(num_ndc) / float(n_fold)) print('Building {} groups of {} drug combinations'.format( n_fold, num_items_group_dc)) dc_groups = diana_analysis.obtain_n_groups_of_k_length( dc_data, n_fold, num_items_group_dc, me_too_drug_combinations) print('Building {} groups of {} non-drug combinations'.format( n_fold, num_items_group_ndc)) ndc_groups = diana_analysis.obtain_n_groups_of_k_length( ndc_data, n_fold, num_items_group_ndc, me_too_drug_combinations) merged_groups = [ pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups) ] if method == 'random': mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy( n_fold, merged_groups, classifiers[classifier]) else: mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob( n_fold, merged_groups, classifiers[classifier]) analysis_results[method] = mean method_to_results[method] = (mean, std) method_to_probs[method] = list_prob print('Method: {}. AUC mean {}. 
AUC results: {}'.format( method, mean, list_auc)) else: print( 'Building {} repetition groups of {} (same) DC and {} (different) non-DC' .format(repetitions, num_dc, num_dc)) ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length( ndc_data, repetitions, num_dc ) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times mean_aucs = [ ] # Here we will store the means of AUCs from the cross-validations std_aucs = [ ] # Here we will store the standard deviations of the AUCs from the cross-validations all_aucs = [] # Here we will store ALL the AUCs all_probs = [] # Here we store all the probabilities and labels if cross_validation: num_repetitions = 0 for ndc_data_equal in ndc_repetitions: num_repetitions += 1 num_items_group = int( float(num_dc) / float(n_fold) ) # Calculate the number of items in each group of the cross-validation if num_repetitions == 1: print( 'Building {} fold groups of {} DC and {} non-DC x {} repetitions' .format(n_fold, num_items_group, num_items_group, repetitions)) dc_groups = diana_analysis.obtain_n_groups_of_k_length( dc_data, n_fold, num_items_group, me_too_drug_combinations ) # Defining the drug combination groups in each cross-validation step ndc_groups = diana_analysis.obtain_n_groups_of_k_length( ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations ) # Defining the non-drug combination groups in each cross-validation step merged_groups = [ pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups) ] if method == 'random': mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy( n_fold, merged_groups, classifiers[classifier]) else: mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob( n_fold, merged_groups, classifiers[classifier]) mean_aucs.append(mean) std_aucs.append(std) all_aucs = all_aucs + list_auc all_probs = all_probs + list_prob final_mean = np.mean(all_aucs) #final_mean = np.mean(mean_aucs) std = 
np.std(all_aucs) mean_std = np.mean(std_aucs) std_means = np.std(mean_aucs) print('FINAL MEAN: {}'.format(final_mean)) print('STD: {}\n'.format(std)) #print('MEAN of STD: {}'.format(mean_std)) else: ndc_repetitions_val = diana_analysis.obtain_n_groups_of_k_length( ndc_data_val, repetitions, num_dc_val) num_repetitions = 0 for ndc_data_equal in ndc_repetitions: ndc_data_equal_val = ndc_repetitions_val[num_repetitions] num_repetitions += 1 train = pd.concat([dc_data, ndc_data_equal]) test = pd.concat([dc_data_val, ndc_data_equal_val]) X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1] X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1] if method == 'random': clf = DummyClassifier().fit(X_train, y_train) y_pred = clf.predict(X_test) else: clf = classifiers[classifier].fit(X_train, y_train) y_pred = clf.predict(X_test) auc = metrics.roc_auc_score(y_test, y_pred) all_aucs.append(auc) # Get probabilities of being drug combination prob = clf.predict_proba( X_test ) # Get the probability used to classify. This is a list, and there is a probability for each class classes = clf.classes_ # This is the order of the classes. 
The probabilities are given in this order for index in xrange(len(classes)): if classes[index] == 1: dc_index = index # Obtain in which position is located the probability of being drug combination for p in xrange(len(prob)): dc_prob = prob[p][ dc_index] # We use the index to obtain the probability of being drug combination dc_label = y_test[p] dc_name = y_test.index.values[ p] # We obtain the name of the drug combination array = [ dc_prob, dc_label, dc_name ] # Create an array with the probability, the label and the name of the pair all_probs.append(array) # Append the array in all_prob final_mean = np.mean(all_aucs) std = np.std(all_aucs) print('FINAL MEAN: {}'.format(final_mean)) print('STD: {}\n'.format(std)) # Store the distribution of AUCs in the dictionary analysis_results[method] = all_aucs method_to_results[method] = (final_mean, std) method_to_probs[method] = all_probs if options.without_repetition: pass else: #------------------------------# # PLOT DISTRIBUTION OF AUC # #------------------------------# methods_without_atc = copy.copy(methods_ordered) methods_without_atc.remove('dcatc') all_data = [analysis_results[method] for method in methods_without_atc] data_labels = [ method_to_label[method] for method in methods_without_atc ] fig = pylab.figure(dpi=300) ax = pylab.axes() pos = 1 all_positions = [] for x in xrange(len(methods_without_atc)): # plot violin plot print(data_labels[x]) print(all_data[x]) parts = ax.violinplot(all_data[x], positions=[pos], showmeans=False, showmedians=True) all_positions.append(pos) pos += 2 # Change color of the body for pc in parts['bodies']: pc.set_facecolor(colors_ordered[x][0]) # Change color of the segments parts['cmedians'].set_color(colors_ordered[x][1]) parts['cbars'].set_color(colors_ordered[x][1]) parts['cmins'].set_color(colors_ordered[x][1]) parts['cmaxes'].set_color(colors_ordered[x][1]) # adding horizontal grid lines ax.yaxis.grid(True) ax.set_xticks([y + 1 for y in range(len(all_data))]) 
ax.set_ylabel('Distribution of AUC values') # add x-tick labels plt.setp(ax, xticks=all_positions, xticklabels=data_labels) #plt.xticks(rotation=15) # Save pylab.savefig(plot_name, format=fig_format) plt.show() #---------------------------------# # PRINT THE RESULTS IN A FILE # #---------------------------------# tables_dir = os.path.join(analysis_dir, 'tables') create_directory(tables_dir) output_file = os.path.join( tables_dir, 'general_performance{}{}.txt'.format(atc_str, pca_str)) with open(output_file, 'w') as output_fd: for method, results in sorted(method_to_results.iteritems(), key=lambda (x, y): y[0], reverse=True): output_fd.write('{}\t{}\t{}\n'.format(method, results[0], results[1])) #-------------------------------------------------------------------# # TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U # #-------------------------------------------------------------------# mannwhitney_file = os.path.join( tables_dir, 'general_performance_mannwhitney{}{}.txt'.format(atc_str, pca_str)) with open(mannwhitney_file, 'w') as mannwhitney_fd: mann_results = {} mannwhitney_fd.write(' ') for method in methods_ordered: mannwhitney_fd.write('\t{}'.format(method_to_label[method])) mannwhitney_fd.write('\n') # Perform the comparisons for method1 in methods_ordered: mann_results.setdefault(method1, {}) for method2 in methods_ordered: if method1 == method2: mann_results[method1][method2] = '-' else: method1_dist = analysis_results[method1] method2_dist = analysis_results[method2] stat, pval = scipy.stats.mannwhitneyu( method1_dist, method2_dist) mann_results[method1][method2] = [stat, pval] # Write the table of crossings for method1 in methods_ordered: mannwhitney_fd.write('{}'.format(method_to_label[method1])) for method2 in methods_ordered: if method1 == method2: mannwhitney_fd.write('\t-') else: stat, pval = mann_results[method1][method2] mannwhitney_fd.write('\t{}, {:.2e}'.format(stat, pval)) mannwhitney_fd.write('\n') 
#-------------------------------------------------------------------------# # PRINT THE MEAN OF PROBABILITIES OF BEING DRUG COMBINATION IN A FILE # #-------------------------------------------------------------------------# prob_file = os.path.join( tables_dir, 'general_performance_probabilities{}{}.txt'.format(atc_str, pca_str)) with open(prob_file, 'w') as prob_fd: for method in methods_ordered: dc2scoresmean = obtain_drug_combination_scores_mean( method_to_probs[method]) for dc, mean in sorted(dc2scoresmean.iteritems(), key=lambda (x, y): y, reverse=True): drug_id1, drug_id2 = dc.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() atcs_drug1 = ', '.join( sorted(set([atc[0] for atc in drugbank_to_atcs[drug1]]))) atcs_drug2 = ', '.join( sorted(set([atc[0] for atc in drugbank_to_atcs[drug2]]))) prob_fd.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format( method, drug1, drug2, atcs_drug1, atcs_drug2, mean)) #-------------------# # SAVE ALL AUCs # #-------------------# auc_file = os.path.join( tables_dir, 'general_performance_aucs{}{}.txt'.format(atc_str, pca_str)) with open(auc_file, 'w') as auc_fd: for method in methods_ordered: auc_fd.write('{}\t{}\n'.format( method, ','.join([str(x) for x in analysis_results[method]]))) # fig = pylab.figure(dpi=300) # ax = pylab.axes() # #pylab.hold(True) # pos = 1 # col_num = 0 # xticks = [] # Define the places in which the labels will be # xlabels = [] # Define the labels (the names of the methods) # #colors = [ ['#9ed0ff, blue'], ['#32f232', 'green'], ['#fbc562', '#d48900'], ['#ff7373', '#b80000'], ['grey', 'black'] ] # for method in methods_ordered: # positions = [] # positions.append(pos) # Define the positions of the boxplots # pos+=2 # Add separation between boxplots # xlabels.append(method_to_label[method]) # Add the method used at the x axis # # Boxplot group # #bp = boxplot(data, positions = positions, widths = 0.6) # bp = pylab.boxplot(analysis_results[method], positions = 
positions, widths = 0.6, patch_artist=True) # tick = np.mean(positions) # The label will be at the mean of the positions (in the middle) # xticks.append(tick) # # Set axes limits and labels # pylab.xlim(0,pos-1) # pylab.ylim(0,1) # ax.set_xticklabels(xlabels) # ax.set_xticks(xticks) # pylab.xlabel('Type of data') # pylab.ylabel('Distribution of AUC values') # fig.autofmt_xdate() # pylab.savefig(plot_name, format=fig_format) # pylab.show() # End marker for time end = time.time() print( '\n DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n' .format(end - start, (end - start) / 60)) return