Example #1
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (whether the pair is a drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print(
                    'The comparison {} is not in the pair2comb dictionary!\n'.
                    format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.
                      format(comparison))
                sys.exit(10)

            results = diana_analysis.get_results_from_table(
                results_table, columns, combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the 'None' values in dcstructure with NaN
    if 'None' in df['dcstructure'].values:
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs,
                     open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(
            open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
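    # Greedy resolution of me-too conflicts: from each me-too pair of drug
    # pairs, keep one member and mark for removal the one that takes part in
    # more me-too relationships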
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs that appear more often in me-too relationships with other drug pairs
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #-------------------------#
    #   EVALUATE PERFORMANCE  #
    #-------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Machine learning parameters
    repetitions = 25  # Number of repetitions
    n_fold = 2  # Number of folds
    min_num_dc_group = 10
    greater_or_smaller = 'greater'
    classifier = 'SVC'
    classifiers = {
        'KNeighbors': KNeighborsClassifier(3),
        'SVC': SVC(probability=True),
        'SVC linear': SVC(kernel="linear", C=0.025),
        'SVC rbf': SVC(gamma=2, C=1),
        'DecisionTree': DecisionTreeClassifier(max_depth=5),
        'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP': MLPClassifier(alpha=1),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis(),
        'SVC best 1': SVC(kernel="linear", C=0.1, probability=True),
        'SVC best 2': SVC(kernel="rbf", gamma=0.01, C=100.0, probability=True)
    }

    # Plot of distributions of AUC
    plot_name = os.path.join(img_dir,
                             'dcGUILD_1_threshold_auc.{}'.format(fig_format))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir,
                                            'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir,
                                             'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))

    print('\nEVALUATION OF DCGUILD\n')
    repetitions = 25
    n_fold = 10
    analysis_results = {}

    # Obtain the different non-drug combination groups to repeat the analysis
    ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
        ndc_data, repetitions, num_dc
    )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

    # dcGUILD_features = [str(x) for x in threshold_list]
    # dcGUILD_feature_to_columns = {}
    # # Get dcGUILD columns
    # for top_threshold in threshold_list:
    #     for data_type in ['node', 'edge', 'function']:
    #         for scoring_function in ['dot_product', 'spearman', 'jaccard']:
    #             col = 'dcg'+'_'+data_type+'_'+str(top_threshold)+'_'+scoring_function
    #             dcGUILD_feature_to_columns.setdefault(str(top_threshold), [])
    #             dcGUILD_feature_to_columns[str(top_threshold)].append(col)
    #     dcGUILD_feature_to_columns[str(top_threshold)].append('combination')

    dcGUILD_features = []
    dcGUILD_feature_to_columns = {}
    # Get dcGUILD columns
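    # Note: only threshold 1 is evaluated here; each dcGUILD feature is paired
    # with the 'combination' label so it can be assessed on its own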
    for top_threshold in [1]:
        for data_type in ['node', 'edge', 'function']:
            for scoring_function in ['dot_product', 'spearman', 'jaccard']:
                col = 'dcg' + '_' + data_type + '_' + str(
                    top_threshold) + '_' + scoring_function
                dcGUILD_features.append(col)
                dcGUILD_feature_to_columns[col] = [col, 'combination']

    for feature in dcGUILD_features:

        df_method = df[dcGUILD_feature_to_columns[feature]]

        dc_data = df_method[df_method['combination'] == 1]
        ndc_data = df_method[df_method['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)

        print(feature)
        print(
            'Building {} repetition groups of {} (same) DC and {} (different) non-DC'
            .format(repetitions, num_dc, num_dc))
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
            ndc_data, repetitions, num_dc
        )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

        mean_aucs = []  # Here we will store the means of AUCs from the cross-validations
        std_aucs = []  # Here we will store the standard deviations of the AUCs from the cross-validations
        all_aucs = []  # Here we will store ALL the AUCs
        all_probs = []  # Here we store all the probabilities and labels

        num_repetitions = 0
        for ndc_data_equal in ndc_repetitions:

            num_repetitions += 1
            num_items_group = int(
                float(num_dc) / float(n_fold)
            )  # Calculate the number of items in each group of the cross-validation
            if num_repetitions == 1:
                print(
                    'Building {} fold groups of {} DC and {} non-DC x {} repetitions'
                    .format(n_fold, num_items_group, num_items_group,
                            repetitions))

            dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                dc_data, n_fold, num_items_group, me_too_drug_combinations
            )  # Defining the drug combination groups in each cross-validation step
            ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data_equal, n_fold, num_items_group,
                me_too_drug_combinations
            )  # Defining the non-drug combination groups in each cross-validation step
            merged_groups = [
                pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
            ]

            mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                n_fold, merged_groups, classifiers[classifier])

            mean_aucs.append(mean)
            std_aucs.append(std)
            all_aucs = all_aucs + list_auc
            all_probs = all_probs + list_prob

        final_mean = np.mean(mean_aucs)
        mean_std = np.mean(std_aucs)
        std_means = np.std(mean_aucs)
        std = np.std(all_aucs)
        print('FINAL MEAN: {}'.format(final_mean))
        print('MEAN of STD: {}'.format(mean_std))
        print('STD: {}\n'.format(std))

        # Store the distribution of AUCs in the dictionary
        analysis_results[feature] = all_aucs

    #------------------------------#
    #   PLOT DISTRIBUTION OF AUC   #
    #------------------------------#

    fig = pylab.figure(dpi=300)
    ax = pylab.axes()
    #pylab.hold(True)
    pos = 1
    col_num = 0

    xticks = []  # Define the places in which the labels will be
    xlabels = []  # Define the labels (the names of the features)
    #colors = [ ['#9ed0ff, blue'], ['#32f232', 'green'], ['#fbc562', '#d48900'], ['#ff7373', '#b80000'], ['grey', 'black'] ]

    for feature in dcGUILD_features:

        positions = []
        positions.append(pos)  # Define the positions of the boxplots
        pos += 2  # Add separation between boxplots
        xlabels.append(feature)  # Add the feature used at the x axis

        # Boxplot group
        #bp = boxplot(data, positions = positions, widths = 0.6)
        bp = pylab.boxplot(analysis_results[feature],
                           positions=positions,
                           widths=0.6,
                           patch_artist=True)

        tick = np.mean(
            positions
        )  # The label will be at the mean of the positions (in the middle)
        xticks.append(tick)

    # Set axes limits and labels
    pylab.xlim(0, pos - 1)
    pylab.ylim(0, 1)
    ax.set_xticklabels(xlabels)
    ax.set_xticks(xticks)
    pylab.xlabel('Features')
    pylab.ylabel('Distribution of AUC values')

    fig.autofmt_xdate()
    pylab.savefig(plot_name, format=fig_format)
    pylab.show()

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return
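
The evaluation loop above pairs the drug combinations with equally sized random samples of non-combinations, runs an n-fold cross-validation on each balanced set, and pools the resulting AUCs into one distribution per dcGUILD feature. The snippet below is a minimal, self-contained sketch of that scheme on synthetic data using plain scikit-learn; the toy table and its single 'feature' column are illustrative assumptions, and cross_val_score stands in for DIANA's run_nfold_crossvalidation_scikit_with_prob helper.

# Minimal sketch of the balanced-resampling / pooled-AUC scheme used above.
# The toy data and the single 'feature' column are illustrative assumptions,
# not DIANA's helpers or data.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

rng = np.random.RandomState(0)

# Toy feature table: combination == 1 for drug combinations, 0 otherwise
toy_df = pd.DataFrame({
    'feature': np.concatenate([rng.normal(1, 1, 50), rng.normal(0, 1, 500)]),
    'combination': [1] * 50 + [0] * 500,
})
dc_data = toy_df[toy_df['combination'] == 1]
ndc_data = toy_df[toy_df['combination'] == 0]

repetitions = 25
n_fold = 10
all_aucs = []  # Pooled AUCs over all repetitions and folds
for _ in range(repetitions):
    # Pair the drug combinations with an equally sized random sample of
    # non-combinations so that every cross-validation run is balanced
    ndc_sample = ndc_data.sample(n=len(dc_data), random_state=rng)
    balanced = pd.concat([dc_data, ndc_sample])
    X = balanced[['feature']].values
    y = balanced['combination'].values
    # n-fold cross-validation scored by the area under the ROC curve
    aucs = cross_val_score(SVC(probability=True), X, y,
                           cv=n_fold, scoring='roc_auc')
    all_aucs.extend(aucs)

print('FINAL MEAN: {}'.format(np.mean(all_aucs)))
print('STD: {}'.format(np.std(all_aucs)))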
Example #2
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print("\n\t\t----------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Selection of classifier\n")
    print("\t\t----------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles and comparisons
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    # Create a directory for the analysis inside the workspace
    analysis_dir = os.path.join(options.workspace, "analysis")
    create_directory(analysis_dir)

    # Create a directory for the analysis of the comparison with other methods
    if options.comparison_other_methods:
        analysis_dir = os.path.join(options.workspace, "analysis_comparison")
        create_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)



    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        output_dataframe = os.path.join(analysis_dir, 'comparison_other_methods.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Prepare files
        network_filename = ntpath.basename(options.sif)
        drugbank2targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
        drug2targets = cPickle.load(open(drugbank2targets_file))

        # Open the crossings file
        crossings_file = options.crossings_file
        with open(crossings_file, 'r') as crossings_file_fd:
            for line in crossings_file_fd:
                crossing = line.strip()
                drug1, drug2 = crossing.split('---')

                # Get drug IDs
                targets1 = list(drug2targets[drug1.upper()])
                drug_id1 = diana_drug.generate_drug_id(drug1, targets1, network_filename)
                targets2 = list(drug2targets[drug2.upper()])
                drug_id2 = diana_drug.generate_drug_id(drug2, targets2, network_filename)

                # Check results table
                comparison = '{}---{}'.format(drug_id1, drug_id2)
                comparison_dir = os.path.join(results_dir, comparison)
                results_table = os.path.join(comparison_dir, 'results_table.tsv')
                if not fileExist(results_table):
                    print('The comparison of {} ({}) and {} ({}) has not been executed properly!\n'.format(drug1, drug_id1, drug2, drug_id2))
                    sys.exit(10)

                if crossing in pair2comb:
                    combination_field = pair2comb[crossing]
                else:
                    print('The comparison {} is not in the pair2comb dictionary!\n'.format(crossing))
                    sys.exit(10)

                results = diana_analysis.get_results_from_table(results_table, columns, combination_field)

                df2 = pd.DataFrame([results], columns=columns, index=[comparison])
                # Add the information to the main data frame
                df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)



    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the 'None' values in dcstructure with NaN
    if 'None' in df['dcstructure'].values:
        df = df.replace(to_replace={'dcstructure':{'None':np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))



    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs that appear more often in me-too relationships with other drug pairs
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))


    #-----------------------------------------------------------#
    #   DIVIDE THE DATASET IN A TRAINING AND A VALIDATION SET   #
    #-----------------------------------------------------------#

    training_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_training.csv')
    validation_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons_validation.csv')
    proportion_training = 0.8

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        training_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_training.csv')
        validation_dataframe = os.path.join(analysis_dir, 'comparison_other_methods_validation.csv')

    if not fileExist(training_dataframe) or not fileExist(validation_dataframe):

        num_dc_training = int(round(num_dc*proportion_training))
        num_ndc_training = int(round(num_ndc*proportion_training))
        print('Training set (positives): {} out of {} ({}%)\n'.format(num_dc_training, num_dc, proportion_training*100))
        print('Training set (negatives): {} out of {} ({}%)\n'.format(num_ndc_training, num_ndc, proportion_training*100))

        dc_data_training = dc_data.sample(n=num_dc_training) # Get a random sample
        ndc_data_training = ndc_data.sample(n=num_ndc_training)
        dc_data_validation = dc_data.loc[~dc_data.index.isin(dc_data_training.index)]  # Remove the sample that we have taken from the dataframe
        ndc_data_validation = ndc_data.loc[~ndc_data.index.isin(ndc_data_training.index)]

        df_training = pd.concat([dc_data_training, ndc_data_training])
        df_validation = pd.concat([dc_data_validation, ndc_data_validation])

        # Output the Pandas dataframes in a CSV file
        df_training.to_csv(training_dataframe)
        df_validation.to_csv(validation_dataframe)

        # Define the variables for the training dataset
        df = df_training
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print('Number of drug combinations after getting the training dataset:\t{}\n'.format(num_dc))
        print('Number of non-drug combinations after getting the training dataset:\t{}\n'.format(num_ndc))

    else:
        df_training = pd.read_csv(training_dataframe, index_col=0)
        df_validation = pd.read_csv(validation_dataframe, index_col=0)

        # Define the variables for the training dataset
        df = df_training
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print('Number of drug combinations after getting the training dataset:\t{}\n'.format(num_dc))
        print('Number of non-drug combinations after getting the training dataset:\t{}\n'.format(num_ndc))



    #------------------------------------------------------------------#
    #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
    #------------------------------------------------------------------#

    if options.pca:

        # Strategy:
        # We calculate the explained variance ratio for all the features.
        # We define a cut-off threshold of the minimum variance ratio that we consider relevant.
        # We will count the number of features with explained variance higher than the cut-off defined.
        # Then, we will reduce the dimensionality to the number of features with variance higher than the cut-off.

        variance_cut_off = 0.01
        num_components = 0
        df_raw = df.drop('combination', axis=1)
        raw_columns = copy.copy(columns)
        raw_columns.remove('combination')
        pca = PCA(n_components=None)
        pca.fit(df_raw)
        values_trans = pca.transform(df_raw)
        explained_variance = pca.explained_variance_ratio_
        for var in explained_variance:
            if var > variance_cut_off:
                num_components += 1

        if num_components < len(raw_columns):

            print('Number of features:\t{}\n'.format(len(raw_columns)))
            print('Reduction to {} components\n'.format(num_components))

            pca = PCA(n_components=num_components)
            pca.fit(df_raw)
            values_trans = pca.transform(df_raw)
            indexes = df.index.values
            df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
            df_comb = df[['combination']]
            df_new = pd.concat([df_trans, df_comb], axis=1)
            df = df_new
            dc_data = df[df['combination'] == 1]
            ndc_data = df[df['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

    else:

        # Manually introduced features
        guild_thresholds = [1, 5]
        rank_scoring = ['spearman', 'dot_product']
        list_scoring = ['jaccard']
        selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)
        print('Selected columns: {}\n'.format(', '.join(selected_columns)))
        print('Number of selected features: {}\n'.format(len(selected_columns)-1)) # Subtract 1 for the 'combination' column

        # Define the new table with the selected columns
        df = df[selected_columns]
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
 

    #------------------------------------------#
    #   TUNE THE ALGORITHM OF THE CLASSIFIER   #
    #------------------------------------------#

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)
    results_table = os.path.join(tables_dir, 'tuning_results.tsv')
    classifiers = {
        'KNeighbors' : KNeighborsClassifier(3),
        'SVC' : SVC(probability=True),
        'SVC linear' : SVC(kernel="linear", C=0.025),
        'SVC rbf' : SVC(gamma=2, C=1),
        'DecisionTree' : DecisionTreeClassifier(max_depth=5),
        'RandomForest' : RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP' : MLPClassifier(alpha=1),
        'AdaBoost' : AdaBoostClassifier(),
        'GaussianNB' : GaussianNB(),
        'QuadraticDiscr.' : QuadraticDiscriminantAnalysis()
    }
    classifier = 'SVC'
    pipe_svc = Pipeline([('slc', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range_c = [1.0, 10.0, 100]
    param_range_gamma = [1e-4, 1e-3, 0.01, 0.1]
    param_grid = [{'clf__C': param_range_c,
                   'clf__kernel': ['linear']},
                  {'clf__C': param_range_c,
                   'clf__gamma': param_range_gamma,
                   'clf__kernel': ['rbf']}]


    print('TUNING THE ALGORITHM OF {}\n'.format(classifier.upper()))
    rounds = 2
    repetitions = 25
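    # Tally how many times each parameter combination is selected as best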
    dict_results = {}

    for n_round in xrange(rounds):

        print('ROUND NUMBER {}\n'.format(n_round+1))

        # Obtain the different non-drug combination groups to repeat the analysis
        ndc_training_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

        for ndc_training_data in ndc_training_groups:

            merged_groups = pd.concat([dc_data, ndc_training_data])
            X_train, y_train = merged_groups.iloc[:, :-1], merged_groups.iloc[:, -1]
            grid_search = GridSearchCV(estimator=pipe_svc,
                                       param_grid=param_grid,
                                       scoring='accuracy',
                                       cv=10,
                                       n_jobs=-1)
            grid = grid_search.fit(X_train, y_train)
            print(grid)
            # summarize the results of the grid search
            print('Grid best score: {}'.format(grid.best_score_))
            result = str(grid.best_params_)
            print('Grid best parameters: {}\n'.format(result))
            dict_results.setdefault(result, 0)
            dict_results[result] += 1

    print('\nFINAL RESULT\n')

    with open(results_table, 'w') as results_table_fd:

        for param_comb in sorted(dict_results, key=dict_results.get, reverse=True):

            print('{}\t{}\n'.format(param_comb, dict_results[param_comb]))
            results_table_fd.write('{}\t{}\n'.format(param_comb, dict_results[param_comb]))



    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))



    return
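
In the tuning block above, this grid search is repeated over many balanced samples and the winning parameter set is tallied each time. The following is a minimal, self-contained sketch of one such grid search, reusing the same pipeline and parameter grid on synthetic data; make_classification is an illustrative stand-in for the DIANA training table.

# Minimal sketch of the classifier-tuning step: a single grid search over the
# StandardScaler + SVC pipeline defined above, run here on synthetic data
# (make_classification is an illustrative stand-in for the DIANA training table).
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_train, y_train = make_classification(n_samples=200, n_features=10, random_state=1)

pipe_svc = Pipeline([('slc', StandardScaler()),
                     ('clf', SVC(random_state=1))])

param_range_c = [1.0, 10.0, 100]
param_range_gamma = [1e-4, 1e-3, 0.01, 0.1]
param_grid = [{'clf__C': param_range_c, 'clf__kernel': ['linear']},
              {'clf__C': param_range_c, 'clf__gamma': param_range_gamma,
               'clf__kernel': ['rbf']}]

# 10-fold grid search scored by accuracy, as in the tuning loop above
grid_search = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                           scoring='accuracy', cv=10, n_jobs=-1)
grid = grid_search.fit(X_train, y_train)
print('Grid best score: {}'.format(grid.best_score_))
print('Grid best parameters: {}'.format(grid.best_params_))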
Example #3
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t----------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Selection of classifier\n"
    )
    print(
        "\t\t----------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles and comparisons
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    # Create a directory for the analysis inside the workspace
    analysis_dir = os.path.join(options.workspace, "analysis")
    create_directory(analysis_dir)

    # Create a directory for the analysis of the comparison with other methods
    if options.comparison_other_methods:
        analysis_dir = os.path.join(options.workspace, "analysis_comparison")
        create_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        output_dataframe = os.path.join(analysis_dir,
                                        'comparison_other_methods.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Prepare files
        network_filename = ntpath.basename(options.sif)
        drugbank2targets_file = os.path.join(toolbox_dir,
                                             'drugbank_to_targets.pcl')
        drug2targets = cPickle.load(open(drugbank2targets_file))

        # Open the crossings file
        crossings_file = options.crossings_file
        with open(crossings_file, 'r') as crossings_file_fd:
            for line in crossings_file_fd:
                crossing = line.strip()
                drug1, drug2 = crossing.split('---')

                # Get drug IDs
                targets1 = list(drug2targets[drug1.upper()])
                drug_id1 = diana_drug.generate_drug_id(drug1, targets1,
                                                       network_filename)
                targets2 = list(drug2targets[drug2.upper()])
                drug_id2 = diana_drug.generate_drug_id(drug2, targets2,
                                                       network_filename)

                # Check results table
                comparison = '{}---{}'.format(drug_id1, drug_id2)
                comparison_dir = os.path.join(results_dir, comparison)
                results_table = os.path.join(comparison_dir,
                                             'results_table.tsv')
                if not fileExist(results_table):
                    print(
                        'The comparison of {} ({}) and {} ({}) has not been executed properly!\n'
                        .format(drug1, drug_id1, drug2, drug_id2))
                    sys.exit(10)

                if crossing in pair2comb:
                    combination_field = pair2comb[crossing]
                else:
                    print(
                        'The comparison {} is not in the pair2comb dictionary!\n'
                        .format(crossing))
                    sys.exit(10)

                results = diana_analysis.get_results_from_table(
                    results_table, columns, combination_field)

                df2 = pd.DataFrame([results],
                                   columns=columns,
                                   index=[comparison])
                # Add the information to the main data frame
                df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the 'None' values in dcstructure with NaN
    if 'None' in df['dcstructure'].values:
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs,
                     open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(
            open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs that appear more often in me-too relationships with other drug pairs
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #-----------------------------------------------------------#
    #   DIVIDE THE DATASET IN A TRAINING AND A VALIDATION SET   #
    #-----------------------------------------------------------#

    training_dataframe = os.path.join(analysis_dir,
                                      'dcdb_comparisons_training.csv')
    validation_dataframe = os.path.join(analysis_dir,
                                        'dcdb_comparisons_validation.csv')
    proportion_training = 0.8

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        training_dataframe = os.path.join(
            analysis_dir, 'comparison_other_methods_training.csv')
        validation_dataframe = os.path.join(
            analysis_dir, 'comparison_other_methods_validation.csv')

    if not fileExist(training_dataframe) or not fileExist(
            validation_dataframe):

        num_dc_training = int(round(num_dc * proportion_training))
        num_ndc_training = int(round(num_ndc * proportion_training))
        print('Training set (positives): {} out of {} ({}%)\n'.format(
            num_dc_training, num_dc, proportion_training * 100))
        print('Training set (negatives): {} out of {} ({}%)\n'.format(
            num_ndc_training, num_ndc, proportion_training * 100))

        dc_data_training = dc_data.sample(
            n=num_dc_training)  # Get a random sample
        ndc_data_training = ndc_data.sample(n=num_ndc_training)
        dc_data_validation = dc_data.loc[~dc_data.index.isin(
            dc_data_training.index)]  # Remove the sample that we have taken from the dataframe
        ndc_data_validation = ndc_data.loc[~ndc_data.index.isin(
            ndc_data_training.index)]

        df_training = pd.concat([dc_data_training, ndc_data_training])
        df_validation = pd.concat([dc_data_validation, ndc_data_validation])

        # Output the Pandas dataframes in a CSV file
        df_training.to_csv(training_dataframe)
        df_validation.to_csv(validation_dataframe)

        # Define the variables for the training dataset
        df = df_training
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Number of drug combinations after getting the training dataset:\t{}\n'
            .format(num_dc))
        print(
            'Number of non-drug combinations after getting the training dataset:\t{}\n'
            .format(num_ndc))

    else:
        df_training = pd.read_csv(training_dataframe, index_col=0)
        df_validation = pd.read_csv(validation_dataframe, index_col=0)

        # Define the variables for the training dataset
        df = df_training
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Number of drug combinations after getting the training dataset:\t{}\n'
            .format(num_dc))
        print(
            'Number of non-drug combinations after getting the training dataset:\t{}\n'
            .format(num_ndc))

    #------------------------------------------------------------------#
    #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
    #------------------------------------------------------------------#

    if options.pca:

        # Strategy:
        # We calculate the explained variance ratio for all the features.
        # We define a cut-off threshold of the minimum variance ratio that we consider relevant.
        # We will count the number of features with explained variance higher than the cut-off defined.
        # Then, we will reduce the dimensionality to the number of features with variance higher than the cut-off.

        variance_cut_off = 0.01
        num_components = 0
        df_raw = df.drop('combination', axis=1)
        raw_columns = copy.copy(columns)
        raw_columns.remove('combination')
        pca = PCA(n_components=None)
        pca.fit(df_raw)
        values_trans = pca.transform(df_raw)
        explained_variance = pca.explained_variance_ratio_
        for var in explained_variance:
            if var > variance_cut_off:
                num_components += 1

        if num_components < len(raw_columns):

            print('Number of features:\t{}\n'.format(len(raw_columns)))
            print('Reduction to {} components\n'.format(num_components))

            pca = PCA(n_components=num_components)
            pca.fit(df_raw)
            values_trans = pca.transform(df_raw)
            indexes = df.index.values
            df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
            df_comb = df[['combination']]
            df_new = pd.concat([df_trans, df_comb], axis=1)
            df = df_new
            dc_data = df[df['combination'] == 1]
            ndc_data = df[df['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

    else:

        # Strategy:
        # We calculate the variance for each feature

        tables_dir = os.path.join(analysis_dir, 'tables')
        create_directory(tables_dir)
        variance_features_file = os.path.join(tables_dir,
                                              'variance_features.txt')
        variance_cut_off = 0.01

        if not fileExist(variance_features_file):

            df_raw = df.drop('combination', axis=1)
            raw_columns = copy.copy(columns)
            raw_columns.remove('combination')

            from sklearn.feature_selection import VarianceThreshold

            selector = VarianceThreshold(variance_cut_off)
            selector.fit(df_raw)
            variances = selector.variances_
            print(variances)
            new_cols = []
            for x in xrange(len(raw_columns)):
                col = raw_columns[x]
                var = variances[x]
                if var > variance_cut_off:
                    new_cols.append(col)
            #df_new = df[new_cols]
            #print(df_new)
            #print(list(df_new.columns.values))
            with open(variance_features_file, 'w') as variance_fd:
                zipped = zip(raw_columns, variances)
                for col, val in sorted(zipped,
                                       key=lambda (x, y): y,
                                       reverse=True):
                    print(col, val)
                    variance_fd.write('{}\t{}\n'.format(col, val))

        correlation_features_file = os.path.join(tables_dir,
                                                 'correlation_thresholds.txt')
        correlation_scoring_file = os.path.join(tables_dir,
                                                'correlation_scoring.txt')
        correlation_scoring_file_guild = os.path.join(
            tables_dir, 'correlation_scoring_guild.txt')

        if not fileExist(correlation_features_file) or not fileExist(
                correlation_scoring_file):

            df_raw = df.drop('combination', axis=1)
            raw_columns = copy.copy(columns)
            raw_columns.remove('combination')

            from scipy.stats import pearsonr

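            # Correlate each dcGUILD feature computed at one threshold with the
            # same feature computed at every other threshold, to spot redundant
            # thresholds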
            comp_to_corr = {}
            for thr1 in threshold_list:
                for thr2 in threshold_list:
                    if thr1 != thr2:
                        for data_type in ['node', 'edge', 'function']:
                            for scoring_function in [
                                    'dot_product', 'spearman', 'jaccard'
                            ]:
                                col1 = 'dcg' + '_' + data_type + '_' + str(
                                    thr1) + '_' + scoring_function
                                col2 = 'dcg' + '_' + data_type + '_' + str(
                                    thr2) + '_' + scoring_function
                                values1 = df_raw[col1]
                                values2 = df_raw[col2]
                                pcorr, pvalue = pearsonr(values1, values2)
                                key = ' '.join([str(x) for x in sorted([thr1, thr2])])
                                comp_to_corr[key + ' ' + data_type + ' ' +
                                             scoring_function] = (pcorr, pvalue)

            with open(correlation_features_file, 'w') as correlation_fd:
                for comp, corr in sorted(comp_to_corr.iteritems(),
                                         key=lambda (x, y): y[0],
                                         reverse=True):
                    print(comp, corr[0])
                    correlation_fd.write('{}\t{}\t{}\n'.format(
                        comp, corr[0], corr[1]))

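            # Correlate the different scoring functions with each other for the
            # target-, Pfam-, function-, ATC- and side-effect-based features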
            comp_to_corr = {}
            for sc1 in ['dot_product', 'spearman', 'jaccard']:
                for sc2 in ['dot_product', 'spearman', 'jaccard']:
                    if sc1 != sc2:
                        for data_type in ['target', 'pfam', 'function']:
                            col1 = 'dct' + '_' + data_type + '_' + sc1
                            col2 = 'dct' + '_' + data_type + '_' + sc2
                            values1 = df_raw[col1]
                            values2 = df_raw[col2]
                            pcorr, pvalue = pearsonr(values1, values2)
                            comp_to_corr[' '.join(sorted([sc1, sc2])) + ' ' +
                                         'targets' + ' ' +
                                         data_type] = (pcorr, pvalue)
                        for method in ['dcatc', 'dcse']:
                            col1 = method + '_' + sc1
                            col2 = method + '_' + sc2
                            values1 = df_raw[col1]
                            values2 = df_raw[col2]
                            pcorr, pvalue = pearsonr(values1, values2)
                            comp_to_corr[' '.join(sorted([sc1, sc2])) + ' ' +
                                         method] = (pcorr, pvalue)

            with open(correlation_scoring_file, 'w') as correlation_fd:
                for comp, corr in sorted(comp_to_corr.iteritems(),
                                         key=lambda (x, y): y[0],
                                         reverse=True):
                    print(comp, corr[0])
                    correlation_fd.write('{}\t{}\t{}\n'.format(
                        comp, corr[0], corr[1]))

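            # Correlate the different scoring functions with each other for the
            # dcGUILD features at each threshold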
            comp_to_corr = {}
            for sc1 in ['dot_product', 'spearman', 'jaccard']:
                for sc2 in ['dot_product', 'spearman', 'jaccard']:
                    if sc1 != sc2:
                        for threshold in threshold_list:
                            for data_type in ['node', 'edge', 'function']:
                                col1 = 'dcg' + '_' + data_type + '_' + str(
                                    threshold) + '_' + sc1
                                col2 = 'dcg' + '_' + data_type + '_' + str(
                                    threshold) + '_' + sc2
                                values1 = df_raw[col1]
                                values2 = df_raw[col2]
                                pcorr, pvalue = pearsonr(values1, values2)
                                comp_to_corr[' '.join(sorted([sc1, sc2])) +
                                             ' ' + str(threshold) + ' ' +
                                             data_type] = (pcorr, pvalue)

            with open(correlation_scoring_file_guild, 'w') as correlation_fd:
                for comp, corr in sorted(comp_to_corr.iteritems(),
                                         key=lambda (x, y): y[0],
                                         reverse=True):
                    print(comp, corr[0])
                    correlation_fd.write('{}\t{}\t{}\n'.format(
                        comp, corr[0], corr[1]))

        forest_features_file = os.path.join(tables_dir,
                                            'forest_importances.txt')

        if not fileExist(forest_features_file):

            X, y = df.iloc[:, :-1], df.iloc[:, -1]
            raw_columns = copy.copy(columns)
            raw_columns.remove('combination')

            from sklearn.ensemble import ExtraTreesClassifier
            from sklearn.datasets import load_iris
            from sklearn.feature_selection import SelectFromModel

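            # Rank the features by the impurity-based importances of an
            # extra-trees ensemble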
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)
            importances = clf.feature_importances_

            zipped = zip(raw_columns, importances)
            with open(forest_features_file, 'w') as forest_fd:
                for col, val in sorted(zipped,
                                       key=lambda (x, y): y,
                                       reverse=True):
                    print(col, val)
                    forest_fd.write('{}\t{}\n'.format(col, val))

        # Manually introduced features
        guild_thresholds = [1, 5]
        rank_scoring = ['spearman', 'dot_product']
        list_scoring = ['jaccard']
        selected_columns = diana_analysis.obtain_columns_best_features(
            guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)
        print('Selected columns: {}\n'.format(', '.join(selected_columns)))
        print('Number of selected features: {}\n'.format(
            len(selected_columns) - 1))  # Subtract 1 for the 'combination' column

        # Define the new table with the selected columns
        df = df[selected_columns]
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)

    #--------------------------#
    #   EVALUATE CLASSIFIERS   #
    #--------------------------#

    if options.pca:
        pca_str = '_withPCA'
    else:
        pca_str = '_withoutPCA'

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'
    plot_name = os.path.join(
        img_dir, 'evaluation_classifiers{}.{}'.format(pca_str, fig_format))
    classifiers = {
        'KNeighbors': KNeighborsClassifier(),
        'SVC rbf': SVC(kernel="rbf"),
        'SVC linear': SVC(kernel="linear"),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'MLP': MLPClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis()
    }
    classifiers_order = [
        'AdaBoost', 'DecisionTree', 'GaussianNB', 'KNeighbors', 'MLP',
        'QuadraticDiscr.', 'RandomForest', 'SVC rbf', 'SVC linear'
    ]
    repetitions = 25
    n_fold = 10

    if not fileExist(plot_name):

        print('\nEVALUATION OF THE CLASSIFIERS\n')
        analysis_results = {}
        classifier_to_results = {}

        # Obtain the different non-drug combination groups to repeat the analysis
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
            ndc_data, repetitions, num_dc
        )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

        for classifier in classifiers:

            print('Classifier: {}\n'.format(classifier))

            mean_aucs = []  # Here we will store the means of AUCs from the cross-validations
            std_aucs = []  # Here we will store the standard deviations of the AUCs from the cross-validations
            all_aucs = []  # Here we will store ALL the AUCs

            for ndc_data_equal in ndc_repetitions:

                num_items_group = int(
                    float(num_dc) / float(n_fold)
                )  # Calculate the number of items in each group of the cross-validation
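                # Assumption from its usage here: obtain_n_groups_of_k_length splits the
                # data into n_fold groups of num_items_group rows each, using the me-too
                # set to keep near-identical drug pairs out of separate folds.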

                dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                    dc_data, n_fold, num_items_group, me_too_drug_combinations
                )  # Defining the drug combination groups in each cross-validation step
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                    ndc_data_equal, n_fold, num_items_group,
                    me_too_drug_combinations
                )  # Defining the non-drug combination groups in each cross-validation step
                merged_groups = [
                    pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
                ]

                mean, var, std, list_auc = diana_analysis.run_nfold_crossvalidation_scikit(
                    n_fold, merged_groups, classifiers[classifier])
                mean_aucs.append(mean)
                std_aucs.append(std)
                all_aucs = all_aucs + list_auc

            #final_mean = np.mean(mean_aucs)
            final_mean = np.mean(all_aucs)
            std = np.std(all_aucs)
            mean_std = np.mean(std_aucs)
            std_means = np.std(mean_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))
            #print('STD of MEANS: {}\n'.format(std_means))
            #print('MEAN of STD: {}'.format(mean_std))

            # Store the distribution of AUCs in the dictionary
            analysis_results[classifier] = all_aucs
            classifier_to_results[classifier] = (final_mean, std)

        print(analysis_results)

        #---------------------------------------------#
        #   PLOT DISTRIBUTION OF AUC PER CLASSIFIER   #
        #---------------------------------------------#

        fig = pylab.figure(dpi=300)
        ax = pylab.axes()
        #pylab.hold(True)
        pos = 1
        col_num = 0

        xticks = []  # Define the places in which the labels will be
        xlabels = []  # Define the labels (the names of the classifiers)
        #colors = [ ['#9ed0ff, blue'], ['#32f232', 'green'], ['#fbc562', '#d48900'], ['#ff7373', '#b80000'], ['grey', 'black'] ]
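        # One boxplot per classifier, drawn at odd positions (1, 3, 5, ...) so that
        # consecutive boxes are separated; each x tick is centred on its box.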

        for classifier in classifiers_order:

            positions = []
            positions.append(pos)  # Define the positions of the boxplots
            pos += 2  # Add separation between boxplots
            xlabels.append(classifier)  # Add the classifier used at the x axis

            # Boxplot group
            #bp = boxplot(data, positions = positions, widths = 0.6)
            bp = pylab.boxplot(analysis_results[classifier],
                               positions=positions,
                               widths=0.6,
                               patch_artist=True)

            tick = np.mean(
                positions
            )  # The label will be at the mean of the positions (in the middle)
            xticks.append(tick)

        # Set axes limits and labels
        pylab.xlim(0, pos - 1)
        pylab.ylim(0, 1)
        ax.set_xticklabels(xlabels)
        ax.set_xticks(xticks)
        pylab.xlabel('Classifiers')
        pylab.ylabel('Distribution of AUC values')

        fig.autofmt_xdate()
        pylab.savefig(plot_name, format=fig_format)
        pylab.show()

        #---------------------------------#
        #   PRINT THE RESULTS IN A FILE   #
        #---------------------------------#

        tables_dir = os.path.join(analysis_dir, 'tables')
        create_directory(tables_dir)
        output_file = os.path.join(
            tables_dir, 'evaluation_classifiers{}.txt'.format(pca_str))
        with open(output_file, 'w') as output_fd:
            for classifier, results in sorted(
                    classifier_to_results.iteritems(),
                    key=lambda (x, y): y[0],
                    reverse=True):
                output_fd.write('{}\t{}\t{}\n'.format(classifier, results[0],
                                                      results[1]))

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return
Example #4
0
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print("\n\t\t-------------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Classify drug combinations\n")
    print("\t\t-------------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)



    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)


        # Obtain all the results subfolders of the results main folder
        results_dir_list = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.format(comparison))
                sys.exit(10)

            results = diana_analysis.get_results_from_table(results_table, columns, combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)



    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if (df['dcstructure'] == 'None').any():
        df = df.replace(to_replace={'dcstructure':{'None':np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))



    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
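    # Greedy filter: for each me-too pair of drug combinations, discard the member that
    # participates in more me-too relationships, so that no two retained drug pairs are
    # me-too equivalents of each other.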
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))



    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    #-----------------------------------------------------#
    #   PLOT DISTRIBUTION OF NUMBER OF TARGETS PER DRUG   #
    #-----------------------------------------------------#

    # Plot distribution of comparisons of targets
    drugbank2targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank2targets_file))
    plot_distribution_targets = os.path.join(img_dir, 'distribution_number_targets.{}'.format(fig_format))
    targets = [len(x) for x in drugbank_to_targets.values()]
    n, bins, patches = plt.hist(np.array(targets), bins=50, weights=np.zeros_like(np.array(targets)) + 1. / np.array(targets).size, facecolor='r')
    plt.xlabel('Number of targets per drug')
    plt.ylabel('Relative frequency')
    plt.title('Distribution of the number of targets per drug')
    plt.savefig(plot_distribution_targets, format=fig_format, dpi=300)
    plt.clf()

    #----------------------------------------------------------------------------------------------#
    #   EVALUATE OVERLAP BETWEEN TARGETS, BIOLOGICAL PROCESSES AND PATHWAYS IN DRUG COMBINATIONS   #
    #----------------------------------------------------------------------------------------------#

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    if options.formula != 'jaccard' and options.formula != 'simpson':
        print('Please provide a valid formula to classify drug combinations: jaccard or simpson!\n')
        sys.exit(10)

    # Plot of distribution of comparisons of Targets
    plot_ji_targets = os.path.join(img_dir, 'distribution_{}_index_targets.{}'.format(options.formula, fig_format))

    # Plot of distribution of comparisons of Biological Processes
    plot_ji_bp = os.path.join(img_dir, 'distribution_{}_index_biological_processes.{}'.format(options.formula, fig_format))

    # Plot of distribution of comparisons of Pathways
    plot_ji_pathways = os.path.join(img_dir, 'distribution_{}_index_pathways.{}'.format(options.formula, fig_format))

    # Output pickle file of the classification
    classification_targets_bp_file = os.path.join(toolbox_dir, 'classification_targets_bp.pcl')
    classification_targets_pathways_file = os.path.join(toolbox_dir, 'classification_targets_pathways.pcl')


    # Get the classification files
    drug_int_2_drugs_file = os.path.join(toolbox_dir, 'drug_int_2_drugs.pcl')
    drug_int_2_drugs = cPickle.load(open(drug_int_2_drugs_file))
    drug_int_2_info_file = os.path.join(toolbox_dir, 'drug_int_2_info.pcl')
    drug_int_2_info = cPickle.load(open(drug_int_2_info_file))
    drugbank_to_dcdb_file = os.path.join(toolbox_dir, 'drugbank_to_dcdb.pcl')
    drugbank_to_dcdb = cPickle.load(open(drugbank_to_dcdb_file))

    bio_processes_file = os.path.join(toolbox_dir, 'target_to_bio_processes.pcl')
    target_to_bio_processes = cPickle.load(open(bio_processes_file))
    pathways_file = os.path.join(toolbox_dir, 'target_to_pathways.pcl')
    target_to_pathways = cPickle.load(open(pathways_file))


    target_comparisons = []
    bp_comparisons = []
    pathway_comparisons = []

    dc_to_target_ji = {}
    dc_to_bp_ji = {}
    dc_to_pathway_ji = {}
    all_drugs = set()
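    # For every drug combination, compute the chosen overlap index (Jaccard or Simpson)
    # between the two drugs' target sets, and between the biological processes and
    # pathways annotated to those targets.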

    for index, row in dc_data.iterrows():

        (drug_id1, drug_id2) = index.split('---')
        drug1 = diana_id_to_drugbank[drug_id1].upper()
        drug2 = diana_id_to_drugbank[drug_id2].upper()
        all_drugs.add(drug1)
        all_drugs.add(drug2)


        if drug1 in drugbank_to_targets and drug2 in drugbank_to_targets:
            targets1 = drugbank_to_targets[drug1]
            targets2 = drugbank_to_targets[drug2]
            if options.formula == 'jaccard':
                result_targets = diana_comparison.calculate_jaccard_index(targets1, targets2)
            elif options.formula == 'simpson':
                result_targets = diana_comparison.calculate_simpson_index(targets1, targets2)
            target_comparisons.append(result_targets)
            dc_to_target_ji[index] = result_targets

            bio_proc1 = get_results_from_dict_of_sets(targets1, target_to_bio_processes)
            bio_proc2 = get_results_from_dict_of_sets(targets2, target_to_bio_processes)
            if options.formula == 'jaccard':
                result_bp = diana_comparison.calculate_jaccard_index(bio_proc1, bio_proc2)
            elif options.formula == 'simpson':
                result_bp = diana_comparison.calculate_simpson_index(bio_proc1, bio_proc2)
            bp_comparisons.append(result_bp)
            dc_to_bp_ji[index] = result_bp

            pathways1 = get_results_from_dict_of_sets(targets1, target_to_pathways)
            pathways2 = get_results_from_dict_of_sets(targets2, target_to_pathways)
            if options.formula == 'jaccard':
                result_pathways = diana_comparison.calculate_jaccard_index(pathways1, pathways2)
            elif options.formula == 'simpson':
                result_pathways = diana_comparison.calculate_simpson_index(pathways1, pathways2)
            pathway_comparisons.append(result_pathways)
            dc_to_pathway_ji[index] = result_pathways

    # Plot distribution of comparisons of targets
    n, bins, patches = plt.hist(np.array(target_comparisons), bins=50, weights=np.zeros_like(np.array(target_comparisons)) + 1. / np.array(target_comparisons).size, facecolor='r')
    plt.xlabel('{} Index of Targets'.format(options.formula.capitalize()))
    plt.ylabel('Relative frequency')
    plt.title('Distribution of {} Index of Targets in drug combinations'.format(options.formula.capitalize()))
    plt.savefig(plot_ji_targets, format=fig_format, dpi=300)
    plt.clf()

    # Plot distribution of comparisons of biological processes
    n, bins, patches = plt.hist(np.array(bp_comparisons), bins=50, weights=np.zeros_like(np.array(bp_comparisons)) + 1. / np.array(bp_comparisons).size, facecolor='b')
    plt.xlabel('{} Index of Biological Processes'.format(options.formula.capitalize()))
    plt.ylabel('Relative frequency')
    plt.title('Distribution of {} Index of Biological Processes in drug combinations'.format(options.formula.capitalize()))
    plt.savefig(plot_ji_bp, format=fig_format, dpi=300)
    plt.clf()

    # Plot distribution of comparisons of pathways
    n, bins, patches = plt.hist(np.array(pathway_comparisons), bins=50, weights=np.zeros_like(np.array(pathway_comparisons)) + 1. / np.array(pathway_comparisons).size, facecolor='g')
    plt.xlabel('{} Index of Pathways'.format(options.formula.capitalize()))
    plt.ylabel('Relative frequency')
    plt.title('Distribution of {} Index of Pathways in drug combinations'.format(options.formula.capitalize()))
    plt.savefig(plot_ji_pathways, format=fig_format, dpi=300)
    plt.clf()


    #------------------------------------#
    #   CLASSIFY THE DRUG COMBINATIONS   #
    #------------------------------------#

    # Similar targets   --> ji >  0.5
    # Different targets --> ji <= 0.5
    target_cut_off = 0.5

    # Similar biological processes   --> ji >  0.5
    # Different biological processes --> ji <= 0.5
    bp_cut_off = 0.5

    # Similar pathways   --> ji >  0.5
    # Different pathways --> ji <= 0.5
    pathway_cut_off = 0.5

    classification_tar_bp = {}

    st = 0
    dt = 0
    st_sbp = 0
    st_dbp = 0
    dt_sbp = 0
    dt_dbp = 0
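    # Label each drug combination by target overlap: pairs above the target cut-off are
    # 'similar_targets'; pairs below it are split into 'different_targets_similar_bp' or
    # 'different_targets_different_bp' depending on their biological process overlap.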
    for dc in dc_to_target_ji:

        # Classify by targets and biological processes
        if dc in dc_to_bp_ji:

            ji_tar = dc_to_target_ji[dc]
            ji_bp = dc_to_bp_ji[dc]

            if ji_tar > target_cut_off:
                classification_tar_bp[dc] = 'similar_targets'
                st += 1
                if ji_bp > bp_cut_off:
                    st_sbp += 1
                elif ji_bp <= bp_cut_off:
                    st_dbp += 1
            elif ji_tar <= target_cut_off:
                dt += 1
                if ji_bp > bp_cut_off:
                    dt_sbp += 1
                    classification_tar_bp[dc] = 'different_targets_similar_bp'
                elif ji_bp <= bp_cut_off:
                    dt_dbp += 1
                    classification_tar_bp[dc] = 'different_targets_different_bp'

    print('Similar targets {}: similar bp {}, diff bp {}\n'.format(st, st_sbp, st_dbp))
    print('Different targets {}: similar bp {}, diff bp {}\n'.format(dt, dt_sbp, dt_dbp))

    cPickle.dump(classification_tar_bp, open(classification_targets_bp_file, 'w'))

    classification_tar_pathway = {}

    st = 0
    dt = 0
    st_spath = 0
    st_dpath = 0
    dt_spath = 0
    dt_dpath = 0
    for dc in dc_to_target_ji:

        # Classify by targets and pathways
        if dc in dc_to_pathway_ji:

            ji_tar = dc_to_target_ji[dc]
            ji_path = dc_to_pathway_ji[dc]

            if ji_tar > target_cut_off:
                classification_tar_pathway[dc] = 'similar_targets'
                st += 1
                if ji_path > pathway_cut_off:
                    st_spath += 1
                elif ji_path <= pathway_cut_off:
                    st_dpath += 1
            elif ji_tar <= target_cut_off:
                dt += 1
                if ji_path > pathway_cut_off:
                    dt_spath += 1
                    classification_tar_pathway[dc] = 'different_targets_similar_pathways'
                elif ji_path <= pathway_cut_off:
                    dt_dpath += 1
                    classification_tar_pathway[dc] = 'different_targets_different_pathways'

    print('Similar targets {}: similar pathways {}, diff pathways {}\n'.format(st, st_spath, st_dpath))
    print('Different targets {}: similar pathways {}, diff pathways {}\n'.format(dt, dt_spath, dt_dpath))

    cPickle.dump(classification_tar_pathway, open(classification_targets_pathways_file, 'w'))


    # Get number of drugs in drug combinations per number of targets
    targets = [len(drugbank_to_targets[drug]) for drug in drugbank_to_targets if drug in all_drugs]
    numtargets_to_numdrugs = {}
    for target in targets:
        numtargets_to_numdrugs.setdefault(target, 0)
        numtargets_to_numdrugs[target] += 1

    print('Number of drugs in drug combination: {}. Divided by four: {}'.format(len(all_drugs), len(all_drugs)/4))
    for numtar, numdrug in sorted(numtargets_to_numdrugs.iteritems(), key=lambda (x, y): x, reverse = True):
        print(numtar, numdrug)

    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))



    return
Example #5
0
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print("\n\t\t------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n")
    print("\t\t------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)



    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)


        # Obtain all the results subfolders of the results main folder
        results_dir_list = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns, combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)



    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if (df['dcstructure'] == 'None').any():
        df = df.replace(to_replace={'dcstructure':{'None':np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))



    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))



    #-------------------------------------#
    #   EVALUATE PERFORMANCE BY TARGETS   #
    #-------------------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Number of targets
    num_targets = [[1],[2],[3,4,5,6],[7]]
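    # Singleton ranges at the extremes are open-ended: [1] keeps pairs where both drugs
    # have at most 1 target, [7] keeps pairs where both have 7 or more, [2] requires
    # exactly 2, and [3,4,5,6] requires both drugs to fall inside that interval.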

    # Names of the methods
    if consider_se:
        if options.different_atc:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcse'] # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcSE', 'Random']
            types_analysis_labels = [ 'Target', 'PPI','Structure', 'Side Effects', 'Random']
        else:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse'] # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcATC', 'dcSE', 'Random']
            types_analysis_labels = [ 'Target', 'PPI','Structure', 'ATC', 'Side Effects', 'Random']
    else:
        types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random']
        types_analysis2 = ['dctargets', 'dcguild', 'dcstructure'] # Without random!!
        #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'Random']
        types_analysis_labels = [ 'Target', 'PPI','Structure', 'Random']


    # Machine learning parameters
    repetitions = 25 # Number of repetitions
    n_fold = 2     # Number of folds
    min_num_dc_group = 10
    greater_or_smaller = 'greater'
    classifier = 'SVC best 1'
    classifiers = {
        'KNeighbors' : KNeighborsClassifier(3),
        'SVC' : SVC(probability=True),
        'SVC linear' : SVC(kernel="linear", C=0.025),
        'SVC rbf' : SVC(gamma=2, C=1),
        'DecisionTree' : DecisionTreeClassifier(max_depth=5),
        'RandomForest' : RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP' : MLPClassifier(alpha=1),
        'AdaBoost' : AdaBoostClassifier(),
        'GaussianNB' : GaussianNB(),
        'QuadraticDiscr.' : QuadraticDiscriminantAnalysis(),
        'SVC best 1' : SVC(kernel="rbf", gamma=0.01, C=100, probability=True),
        'SVC best 2' : SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True)
    }

    if options.pca:
        pca_str = '_withPCA'
    else:
        pca_str = '_withoutPCA'

    # Plot of distributions of AUC
    plot_auc_distribution = os.path.join(img_dir, 'numtargets_auc_distribution_ranges{}.{}'.format(pca_str, fig_format))

    # Plot of accuracy/sensitivity name
    acc_sens_dctargets = os.path.join(img_dir, 'numtargets_accsens_dctargets_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcguild = os.path.join(img_dir, 'numtargets_accsens_dcguild_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcstructure = os.path.join(img_dir, 'numtargets_accsens_dcstructure_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcatc = os.path.join(img_dir, 'numtargets_accsens_dcatc_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcse = os.path.join(img_dir, 'numtargets_accsens_dcse_ranges{}.{}'.format(pca_str, fig_format))

    # Results table
    results_table = os.path.join(tables_dir, 'numtargets_auc_table_ranges{}.txt'.format(pca_str))

    # Accuracy/Sensitivity results table
    prec_rec_table = os.path.join(tables_dir, 'numtargets_accsens_table_ranges{}.txt'.format(pca_str))

    # File with results of Mann Whitney tests
    mannwhitney_file = os.path.join(tables_dir, 'numtargets_mannwhitney_ranges{}.txt'.format(pca_str))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))


    analysis_results = {} # Defining the dictionary that will store the results

    if consider_se:
        dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)
    else:
        dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)

    for range_tar in num_targets:

        selected_rows = []

        for index, row in df.iterrows():

            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            if len(range_tar) == 1:
                # If it is the first range of the list
                if range_tar == num_targets[0]:
                    if len(drugbank_to_targets[drug1]) <= range_tar[0] and len(drugbank_to_targets[drug2]) <= range_tar[0]:
                        selected_rows.append(index)
                # If it is the last range of the list
                elif range_tar == num_targets[len(num_targets)-1]:
                    if len(drugbank_to_targets[drug1]) >= range_tar[0] and len(drugbank_to_targets[drug2]) >= range_tar[0]:
                        selected_rows.append(index)
                # If it is an intermediate range of the list
                else:
                    if len(drugbank_to_targets[drug1]) == range_tar[0] and len(drugbank_to_targets[drug2]) == range_tar[0]:
                        selected_rows.append(index)
            else:
                if len(drugbank_to_targets[drug1]) in range_tar and len(drugbank_to_targets[drug2]) in range_tar:
                    selected_rows.append(index)


        df_tar = df.ix[selected_rows]
        dc_data = df_tar[df_tar['combination'] == 1]
        num_dc = len(dc_data.index)
        print('Num drug combinations: {}'.format(num_dc))

        if consider_se:
            list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns] ]
        else:
            list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['random', columns] ]

        for method, columns_method in list_methods:

            print('Evaluating {} targets with method {}\n'.format(range_tar,method))

            #------------------------------------------------------------------#
            #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
            #------------------------------------------------------------------#

            if options.pca:

                variance_cut_off = 0.01
                num_components = 0
                df_method = df_tar[columns_method]
                df_raw = df_method.drop('combination', axis=1)
                raw_columns = copy.copy(columns_method)
                raw_columns.remove('combination')
                pca = PCA(n_components=None)
                pca.fit(df_raw)
                values_trans = pca.transform(df_raw)
                explained_variance = pca.explained_variance_ratio_
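                # Count how many principal components exceed the explained-variance
                # cut-off; the column names in the zip below are only placeholders,
                # since components do not correspond one-to-one to original features.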
                for column, var in sorted(zip(raw_columns, explained_variance), key=lambda x: x[1], reverse=True):
                    #print(column, var)
                    if var > variance_cut_off:
                        num_components += 1

                if num_components < len(raw_columns):

                    print('Number of features:\t{}\n'.format(len(raw_columns)))
                    print('Reduction to {} components\n'.format(num_components))

                    pca = PCA(n_components=num_components)
                    pca.fit(df_raw)
                    values_trans = pca.transform(df_raw)
                    indexes = df_method.index.values
                    df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
                    df_comb = df_method[['combination']]
                    df_new = pd.concat([df_trans, df_comb], axis=1)
                    df_method = df_new

            else:

                # Manually introduced features
                guild_thresholds = [1, 5]
                rank_scoring = ['spearman', 'dot_product']
                list_scoring = ['jaccard']
                if method == 'Combination' or method == 'random':
                    selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)
                else:
                    selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(method, guild_thresholds, rank_scoring, list_scoring)

                # Remove ATC columns if different ATC
                if options.different_atc and consider_se:
                    selected_columns = [col for col in selected_columns if col not in dcatc_columns or col == 'combination']

                print('Selected columns: {}\n'.format(', '.join(selected_columns)))
                print('Number of selected features: {}\n'.format(len(selected_columns)-1)) # We take away the combinations column

                # Define the new table with the selected columns
                df_method = df_tar[selected_columns]
                dc_data = df_method[df_method['combination'] == 1]
                ndc_data = df_method[df_method['combination'] == 0]
                num_dc = len(dc_data.index)
                num_ndc = len(ndc_data.index)

            #------------------------------------------------------------------#


            dc_data = df_method[df_method['combination'] == 1]
            ndc_data = df_method[df_method['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

            print('Building {} repetition groups of {} (same) DC and {} (different) non-DC'.format(repetitions,num_dc,num_dc))
            ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

            mean_aucs = [] # Here we will store the means of AUCs from the cross-validations
            std_aucs = [] # Here we will store the standard deviations of the AUCs from the cross-validations
            all_aucs = [] # Here we will store ALL the AUCs
            all_probs = [] # Here we store all the probabilities and labels

            num_repetitions=0
            for ndc_data_equal in ndc_repetitions:

                num_repetitions+=1
                num_items_group = int( float(num_dc) / float(n_fold) ) # Calculate the number of items in each group of the cross-validation
                if num_repetitions == 1:
                    print('Building {} fold groups of {} DC and {} non-DC x {} repetitions'.format(n_fold,num_items_group,num_items_group, repetitions))

                dc_groups = diana_analysis.obtain_n_groups_of_k_length(dc_data, n_fold, num_items_group, me_too_drug_combinations) # Defining the drug combination groups in each cross-validation step
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations) # Defining the non-drug combination groups in each cross-validation step
                merged_groups = [pd.concat([x,y]) for x,y in zip(dc_groups, ndc_groups)]

                if method == 'random':
                    #mean, var, std, list_auc = run_nfold_crossvalidation_random(n_fold, merged_groups, classifiers[classifier])
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(n_fold, merged_groups, classifiers[classifier])
                else:
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(n_fold, merged_groups, classifiers[classifier])

                mean_aucs.append(mean)
                std_aucs.append(std)
                all_aucs = all_aucs + list_auc
                all_probs = all_probs + list_prob

            final_mean = np.mean(all_aucs)
            #final_mean = np.mean(mean_aucs)
            std = np.std(all_aucs)
            mean_std = np.mean(std_aucs)
            std_means = np.std(mean_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))
            #print('MEAN of STD: {}'.format(mean_std))

            # Store the distribution of AUCs in the dictionary
            analysis_results.setdefault(range_tar[0], {})
            analysis_results[range_tar[0]].setdefault(method, {})
            analysis_results[range_tar[0]][method]['all_aucs'] = all_aucs
            analysis_results[range_tar[0]][method]['all_probs'] = all_probs
            analysis_results[range_tar[0]][method]['mean'] = final_mean
            analysis_results[range_tar[0]][method]['std'] = std
            analysis_results[range_tar[0]][method]['num_dc'] = num_dc


    #------------------------------------#
    #   PLOT PRECISION VS. SENSITIVITY   #
    #------------------------------------#

    analysis_results = plot_precision_sensitivity(analysis_results, 'dctargets', num_targets, acc_sens_dctargets)
    analysis_results = plot_precision_sensitivity(analysis_results, 'dcguild', num_targets, acc_sens_dcguild)
    analysis_results = plot_precision_sensitivity(analysis_results, 'dcstructure', num_targets, acc_sens_dcstructure)
    if consider_se:
        analysis_results = plot_precision_sensitivity(analysis_results, 'dcatc', num_targets, acc_sens_dcatc)
        analysis_results = plot_precision_sensitivity(analysis_results, 'dcse', num_targets, acc_sens_dcse)


    #----------------------------------------------------#
    #   PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #----------------------------------------------------#

    plot_auc_distributions(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se)


    #--------------------------------------------------------#
    #   TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #--------------------------------------------------------#

    with open(results_table, 'w') as results_table_fd:

        # Header
        results_table_fd.write(' ')
        for method in types_analysis_labels:
            results_table_fd.write('\t{}\t \t '.format(method))
        results_table_fd.write('\n')

        for num in num_targets:
            results_table_fd.write('{}'.format(num))
            for method in types_analysis:
                mean = analysis_results[num[0]][method]['mean']
                std = analysis_results[num[0]][method]['std']
                num_dc = analysis_results[num[0]][method]['num_dc']
                results_table_fd.write('\t{}\t{}\t{}'.format(mean, std, num_dc))
            results_table_fd.write('\n')


    #----------------------------------------#
    #   TABLE OF PRECISION VS. SENSITIVITY   #
    #----------------------------------------#

    with open(prec_rec_table, 'w') as prec_rec_table_fd:

        # Header
        prec_rec_table_fd.write(' ')
        for method in types_analysis2:
            prec_rec_table_fd.write('\t{}\t '.format(method))
        prec_rec_table_fd.write('\n')

        for num in num_targets:
            prec_rec_table_fd.write('{}'.format(num))
            for method in types_analysis2:
                cut_off = analysis_results[num[0]][method]['cut_off']
                value = analysis_results[num[0]][method]['value']
                prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value))
            prec_rec_table_fd.write('\n')


    #-------------------------------------------------------------------#
    #   TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U   #
    #-------------------------------------------------------------------#

    with open(mannwhitney_file, 'w') as mannwhitney_fd:

        mann_results = {}

        mannwhitney_fd.write(' \t ')
        for method in types_analysis_labels:
            mannwhitney_fd.write('\t{}'.format(method))
        mannwhitney_fd.write('\n')

        # Perform the comparisons
        for num in num_targets:
            mann_results.setdefault(num[0], {})
            for method1 in types_analysis:
                mann_results[num[0]].setdefault(method1, {})
                for method2 in types_analysis:
                    if method1 == method2:
                        mann_results[num[0]][method1][method2] = '-'
                    else:
                        method1_dist = analysis_results[num[0]][method1]['all_aucs']
                        method2_dist = analysis_results[num[0]][method2]['all_aucs']
                        stat, pval = scipy.stats.mannwhitneyu(method1_dist, method2_dist)
                        mann_results[num[0]][method1][method2] = [stat, pval]

        # Write the table of crossings
        for num in num_targets:
            for method1 in types_analysis:
                mannwhitney_fd.write('{}\t{}'.format(num[0], method1))
                for method2 in types_analysis:
                    if method1 == method2:
                        mannwhitney_fd.write('\t-')
                    else:
                        stat, pval = mann_results[num[0]][method1][method2]
                        mannwhitney_fd.write('\t{}, {:.2e}'.format(stat,pval))
                mannwhitney_fd.write('\n')




    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))



    return
Example #6
0
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Create a directory for the analysis of the comparison with other methods
    if options.comparison_other_methods:
        analysis_dir = os.path.join(options.workspace, "analysis_comparison")
        create_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Make a cross-validation with the validation set (True)
    # or make a training with the training and a validation with the validation (False)
    cross_validation = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    # Change the name of the output file if we are doing a comparison with other methods
    if options.comparison_other_methods:
        output_dataframe = os.path.join(analysis_dir,
                                        'comparison_other_methods.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print(
                    'The comparison {} is not in the pair2comb dictionary!\n'.
                    format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.
                      format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns,
                                             combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if (df['dcstructure'] == 'None').any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs,
                     open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(
            open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #----------------------------#
    #   GET THE VALIDATION SET   #
    #----------------------------#

    training_dataframe = os.path.join(analysis_dir,
                                      'dcdb_comparisons_training.csv')
    validation_dataframe = os.path.join(analysis_dir,
                                        'dcdb_comparisons_validation.csv')
    df_training = pd.read_csv(training_dataframe, index_col=0)
    df_validation = pd.read_csv(validation_dataframe, index_col=0)
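    # If cross-validation is enabled, only the validation split is analysed (by n-fold
    # cross-validation); otherwise the classifiers are trained on the training split
    # and tested on the validation split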

    if cross_validation:
        df = df_validation
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Number of drug combinations after getting the validation dataset:\t{}\n'
            .format(num_dc))
        print(
            'Number of non-drug combinations after getting the validation dataset:\t{}\n'
            .format(num_ndc))
    else:
        # Define the variables for the training dataset
        df = df_training
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Number of drug combinations after getting the training dataset:\t{}\n'
            .format(num_dc))
        print(
            'Number of non-drug combinations after getting the training dataset:\t{}\n'
            .format(num_ndc))

        # Define the variables for the validation dataset
        dc_data_val = df_validation[df_validation['combination'] == 1]
        ndc_data_val = df_validation[df_validation['combination'] == 0]
        num_dc_val = len(dc_data_val.index)
        num_ndc_val = len(ndc_data_val.index)
        print(
            'Number of drug combinations after getting the validation dataset:\t{}\n'
            .format(num_dc_val))
        print(
            'Number of non-drug combinations after getting the validation dataset:\t{}\n'
            .format(num_ndc_val))

    #-------------------------#
    #   EVALUATE PERFORMANCE  #
    #-------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Machine learning parameters
    repetitions = 25  # Number of repetitions
    n_fold = 10  # Number of folds
    min_num_dc_group = 10
    classifier = 'SVC best 1'
    classifiers = {
        'KNeighbors': KNeighborsClassifier(3),
        'SVC': SVC(probability=True),
        'SVC linear': SVC(kernel="linear", C=0.025),
        'SVC rbf': SVC(gamma=2, C=1),
        'DecisionTree': DecisionTreeClassifier(max_depth=5),
        'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP': MLPClassifier(alpha=1),
        'AdaBoost': AdaBoostClassifier(),
        'GaussianNB': GaussianNB(),
        'QuadraticDiscr.': QuadraticDiscriminantAnalysis(),
        'SVC best 1': SVC(kernel="rbf", gamma=0.01, C=100, probability=True),
        'SVC best 2': SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True)
    }

    if options.pca:
        pca_str = '_withPCA'
    else:
        pca_str = '_withoutPCA'
    if options.different_atc:
        atc_str = '_diff_ATC'
    else:
        atc_str = ''

    # Plot of distributions of AUC
    plot_name = os.path.join(
        img_dir,
        'general_performance_by_methods{}{}.{}'.format(atc_str, pca_str,
                                                       fig_format))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir,
                                            'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the ATC file
    drugbank_to_atcs_file = os.path.join(toolbox_dir, 'drugbank_to_atcs.pcl')
    drugbank_to_atcs = cPickle.load(open(drugbank_to_atcs_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir,
                                             'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))

    print('\nEVALUATION OF GENERAL PERFORMANCE\n')
    analysis_results = {}
    method_to_results = {}
    method_to_probs = {}

    # Get columns for each method
    if consider_se:
        dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(
            threshold_list, ATC_SE=consider_se)
    else:
        dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns(
            threshold_list, ATC_SE=consider_se)
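    # dct/dcg/dcs (and dcatc/dcse when side effects are considered) hold the feature
    # columns of the target, GUILD (PPI), structure, ATC and side-effect comparisons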

    # Remove ATC columns if different ATC is required
    # (dcatc_columns is only defined when side effects/ATC are considered)
    if options.different_atc and consider_se:
        columns = [
            col for col in columns
            if col not in dcatc_columns or col == 'combination'
        ]
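    # Each entry of list_methods pairs a method name with the feature columns used to
    # train its classifier; 'Combination' uses all the columns and 'random' is
    # evaluated with a dummy classifier over the same columns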

    if consider_se:
        if options.different_atc:
            list_methods = [['Combination',
                             columns], ['dctargets', dct_columns],
                            ['dcguild', dcg_columns],
                            ['dcstructure', dcs_columns],
                            ['dcatc', dcatc_columns], ['dcse', dcse_columns],
                            ['random', columns]]
            methods_ordered = [
                'Combination', 'dctargets', 'dcguild', 'dcstructure', 'dcatc',
                'dcse', 'random'
            ]
            method_to_label = {
                'Combination': 'All',
                'dctargets': 'Target',
                'dcguild': 'PPI',
                'dcstructure': 'Structure',
                'dcatc': 'ATC',
                'dcse': 'Side Effects',
                'random': 'Random'
            }
            colors_ordered = [['yellow', 'black'], ['#ff7373', 'red'],
                              ['#32f232', 'green'], ['#4f4f4f', 'black'],
                              ['#e59600', '#966200'], ['#aeaeae', 'black']
                              ]  # yellow, red, green, black, orange, grey
        else:
            list_methods = [['Combination',
                             columns], ['dctargets', dct_columns],
                            ['dcguild', dcg_columns],
                            ['dcstructure', dcs_columns],
                            ['dcatc', dcatc_columns], ['dcse', dcse_columns],
                            ['random', columns]]
            methods_ordered = [
                'Combination', 'dctargets', 'dcguild', 'dcstructure', 'dcatc',
                'dcse', 'random'
            ]
            method_to_label = {
                'Combination': 'All',
                'dctargets': 'Target',
                'dcguild': 'PPI',
                'dcstructure': 'Structure',
                'dcatc': 'ATC',
                'dcse': 'Side Effects',
                'random': 'Random'
            }
            colors_ordered = [
                ['yellow', 'black'], ['#ff7373', 'red'], ['#32f232', 'green'],
                ['#4f4f4f', 'black'], ['#22a9bd', '#0049e5'],
                ['#e59600', '#966200'], ['#aeaeae', 'black']
            ]  # yellow, red, green, black, blue, orange, grey
    else:
        list_methods = [['Combination', columns], ['dctargets', dct_columns],
                        ['dcguild', dcg_columns], ['dcstructure', dcs_columns],
                        ['random', columns]]
        methods_ordered = [
            'Combination', 'dctargets', 'dcguild', 'dcstructure', 'random'
        ]
        method_to_label = {
            'Combination': 'All',
            'dctargets': 'Target',
            'dcguild': 'PPI',
            'dcstructure': 'Structure',
            'random': 'Random'
        }
        colors_ordered = [
            ['white', 'black'], ['#ff7373', 'red'], ['#32f232', 'green'],
            ['#4f4f4f', 'black'], ['#aeaeae', 'black']
        ]  # white, red, green, black, grey

    #-------------------------------------------------#
    #   SELECT DRUG COMBINATIONS WITH DIFFERENT ATC   #
    #-------------------------------------------------#

    if options.different_atc:
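        # Keep only the drug pairs whose two drugs share no ATC code at the first
        # (anatomical) level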

        selected_rows = []
        for index, row in df.iterrows():
            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]])
            atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]])
            intersection = atcs_drug1 & atcs_drug2
            if len(intersection) == 0:
                selected_rows.append(index)

        df = df.loc[selected_rows]
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Num drug combinations after removing the ones with same ATC in training: {}'
            .format(num_dc))
        print(
            'Num non-drug combinations after removing the ones with same ATC in training: {}'
            .format(num_ndc))

        selected_rows = []
        for index, row in df_validation.iterrows():
            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]])
            atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]])
            intersection = atcs_drug1 & atcs_drug2
            if len(intersection) == 0:
                selected_rows.append(index)

        df_validation = df_validation.loc[selected_rows]
        dc_data_val = df_validation[df_validation['combination'] == 1]
        ndc_data_val = df_validation[df_validation['combination'] == 0]
        num_dc_val = len(dc_data_val.index)
        num_ndc_val = len(ndc_data_val.index)
        print(
            'Num drug combinations (in validation) after removing the ones with same ATC in training: {}'
            .format(num_dc_val))
        print(
            'Num non-drug combinations (in validation) after removing the ones with same ATC in training: {}'
            .format(num_ndc_val))

    #--------------------------#
    #   EVALUATE EACH METHOD   #
    #--------------------------#

    for method, columns_method in list_methods:

        print('Evaluating method {}\n'.format(method))

        #------------------------------------------------------------------#
        #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
        #------------------------------------------------------------------#

        if options.pca:

            # Strategy:
            # We calculate the explained variance ratio of every principal component,
            # define a cut-off threshold for the minimum variance ratio considered relevant,
            # count the number of components with explained variance above the cut-off,
            # and reduce the dimensionality to that number of components.

            variance_cut_off = 0.01
            num_components = 0
            scoring_methods = ['spearman', 'dot_product', 'jaccard']
            df_method = df[columns_method]
            df_val = df_validation[columns_method]
            df_all = pd.concat([df_method, df_val])
            df_raw = df_all.drop('combination', axis=1)
            raw_columns = copy.copy(columns_method)
            raw_columns.remove('combination')
            pca = PCA(n_components=None)
            pca.fit(df_raw)
            values_trans = pca.transform(df_raw)
            explained_variance = pca.explained_variance_ratio_
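            # The explained variance ratios are sorted in decreasing order, so keeping
            # the first num_components components retains exactly those above the cut-off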
            for var in explained_variance:
                if var > variance_cut_off:
                    num_components += 1

            if num_components < len(raw_columns):

                print('Number of features:\t{}\n'.format(len(raw_columns)))
                print('Reduction to {} components\n'.format(num_components))

                pca = PCA(n_components=num_components)
                pca.fit(df_raw)
                values_trans = pca.transform(df_raw)
                indexes = df_all.index.values
                df_trans = pd.DataFrame.from_records(values_trans,
                                                     index=indexes)
                df_comb = df_all[['combination']]
                df_pca = pd.concat([df_trans, df_comb], axis=1)
                train_indexes = df_method.index.values
                val_indexes = df_val.index.values

                df_method = df_pca.loc[train_indexes]
                dc_data = df_method[df_method['combination'] == 1]
                ndc_data = df_method[df_method['combination'] == 0]
                num_dc = len(dc_data.index)
                num_ndc = len(ndc_data.index)

                df_val = df_pca.loc[val_indexes]
                dc_data_val = df_val[df_val['combination'] == 1]
                ndc_data_val = df_val[df_val['combination'] == 0]
                num_dc_val = len(dc_data_val.index)
                num_ndc_val = len(ndc_data_val.index)

        else:

            # Manually introduced features
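            # (GUILD thresholds and scoring functions used to select the 'best feature' columns)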
            guild_thresholds = [1, 5]
            rank_scoring = ['spearman', 'dot_product']
            list_scoring = ['jaccard']
            if method == 'Combination' or method == 'random':
                selected_columns = diana_analysis.obtain_columns_best_features(
                    guild_thresholds,
                    rank_scoring,
                    list_scoring,
                    ATC_SE=consider_se)
            else:
                selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(
                    method, guild_thresholds, rank_scoring, list_scoring)

            # Remove ATC columns if different ATC
            if options.different_atc and consider_se:
                if method != 'dcatc':
                    selected_columns = [
                        col for col in selected_columns
                        if col not in dcatc_columns or col == 'combination'
                    ]

            print('Selected columns: {}\n'.format(', '.join(selected_columns)))
            print('Number of selected features: {}\n'.format(
                len(selected_columns) - 1))  # Subtract the 'combination' column

            # Define the new table with the selected columns
            df_method = df[selected_columns]
            dc_data = df_method[df_method['combination'] == 1]
            ndc_data = df_method[df_method['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

            # Define also the validation table with new columns
            df_val = df_validation[selected_columns]
            dc_data_val = df_val[df_val['combination'] == 1]
            ndc_data_val = df_val[df_val['combination'] == 0]
            num_dc_val = len(dc_data_val.index)
            num_ndc_val = len(ndc_data_val.index)

        #-------------------------#
        #   CLASSIFY DRUG PAIRS   #
        #-------------------------#

        if options.without_repetition:

            # from sklearn.model_selection import train_test_split
            # data, target = df_method.iloc[:, :-1], df_method.iloc[:, -1]
            # X_train, X_test, y_train, y_test = train_test_split(
            #     data, target, test_size=0.1, random_state=0)
            # clf = classifiers[classifier].fit(X_train, y_train)
            # y_pred = clf.predict(X_test)
            # fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
            # auc = metrics.roc_auc_score(y_test, y_pred)
            # analysis_results[method] = auc
            # print('Method: {}. AUC {}.'.format(method, auc))
            # print(fpr)
            # print(tpr)

            # Calculate the number of items per group
            num_items_group_dc = int(float(num_dc) / float(n_fold))
            num_items_group_ndc = int(float(num_ndc) / float(n_fold))
            print('Building {} groups of {} drug combinations'.format(
                n_fold, num_items_group_dc))
            dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                dc_data, n_fold, num_items_group_dc, me_too_drug_combinations)
            print('Building {} groups of {} non-drug combinations'.format(
                n_fold, num_items_group_ndc))
            ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data, n_fold, num_items_group_ndc,
                me_too_drug_combinations)
            merged_groups = [
                pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
            ]
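            # Each fold group joins one group of drug combinations with one group of
            # non-drug combinations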

            if method == 'random':
                mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(
                    n_fold, merged_groups, classifiers[classifier])
            else:
                mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                    n_fold, merged_groups, classifiers[classifier])

            analysis_results[method] = mean
            method_to_results[method] = (mean, std)
            method_to_probs[method] = list_prob
            print('Method: {}. AUC mean {}. AUC results: {}'.format(
                method, mean, list_auc))

        else:

            print(
                'Building {} repetition groups of {} (same) DC and {} (different) non-DC'
                .format(repetitions, num_dc, num_dc))
            ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data, repetitions, num_dc
            )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times
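            # Each repetition pairs all the drug combinations with an equally sized
            # random group of non-drug combinations, so the classifier always sees a
            # balanced dataset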

            mean_aucs = []  # Means of the AUCs from the cross-validations
            std_aucs = []  # Standard deviations of the AUCs from the cross-validations
            all_aucs = []  # ALL the AUCs
            all_probs = []  # All the probabilities and labels

            if cross_validation:
                num_repetitions = 0
                for ndc_data_equal in ndc_repetitions:

                    num_repetitions += 1
                    num_items_group = int(
                        float(num_dc) / float(n_fold)
                    )  # Calculate the number of items in each group of the cross-validation
                    if num_repetitions == 1:
                        print(
                            'Building {} fold groups of {} DC and {} non-DC x {} repetitions'
                            .format(n_fold, num_items_group, num_items_group,
                                    repetitions))

                    dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                        dc_data, n_fold, num_items_group,
                        me_too_drug_combinations
                    )  # Defining the drug combination groups in each cross-validation step
                    ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                        ndc_data_equal, n_fold, num_items_group,
                        me_too_drug_combinations
                    )  # Defining the non-drug combination groups in each cross-validation step
                    merged_groups = [
                        pd.concat([x, y])
                        for x, y in zip(dc_groups, ndc_groups)
                    ]

                    if method == 'random':
                        mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(
                            n_fold, merged_groups, classifiers[classifier])
                    else:
                        mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                            n_fold, merged_groups, classifiers[classifier])

                    mean_aucs.append(mean)
                    std_aucs.append(std)
                    all_aucs = all_aucs + list_auc
                    all_probs = all_probs + list_prob

                final_mean = np.mean(all_aucs)
                #final_mean = np.mean(mean_aucs)
                std = np.std(all_aucs)
                mean_std = np.mean(std_aucs)
                std_means = np.std(mean_aucs)
                print('FINAL MEAN: {}'.format(final_mean))
                print('STD: {}\n'.format(std))
                #print('MEAN of STD: {}'.format(mean_std))

            else:
                ndc_repetitions_val = diana_analysis.obtain_n_groups_of_k_length(
                    ndc_data_val, repetitions, num_dc_val)
                num_repetitions = 0
                for ndc_data_equal in ndc_repetitions:
                    ndc_data_equal_val = ndc_repetitions_val[num_repetitions]
                    num_repetitions += 1
                    train = pd.concat([dc_data, ndc_data_equal])
                    test = pd.concat([dc_data_val, ndc_data_equal_val])
                    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
                    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
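                    # The last column of the feature tables is the 'combination' class
                    # label, so it is split off here as the target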

                    if method == 'random':
                        clf = DummyClassifier().fit(X_train, y_train)
                        y_pred = clf.predict(X_test)
                    else:
                        clf = classifiers[classifier].fit(X_train, y_train)
                        y_pred = clf.predict(X_test)
                    auc = metrics.roc_auc_score(y_test, y_pred)
                    all_aucs.append(auc)

                    # Get probabilities of being drug combination
                    prob = clf.predict_proba(
                        X_test
                    )  # Get the probability used to classify. This is a list, and there is a probability for each class
                    classes = clf.classes_  # This is the order of the classes. The probabilities are given in this order
                    for index in xrange(len(classes)):
                        if classes[index] == 1:
                            dc_index = index  # Obtain in which position is located the probability of being drug combination
                    for p in xrange(len(prob)):
                        dc_prob = prob[p][
                            dc_index]  # We use the index to obtain the probability of being drug combination
                        dc_label = y_test[p]
                        dc_name = y_test.index.values[
                            p]  # We obtain the name of the drug combination
                        array = [
                            dc_prob, dc_label, dc_name
                        ]  # Create an array with the probability, the label and the name of the pair
                        all_probs.append(array)  # Append the array in all_prob

                final_mean = np.mean(all_aucs)
                std = np.std(all_aucs)
                print('FINAL MEAN: {}'.format(final_mean))
                print('STD: {}\n'.format(std))

            # Store the distribution of AUCs in the dictionary
            analysis_results[method] = all_aucs
            method_to_results[method] = (final_mean, std)
            method_to_probs[method] = all_probs

    if options.without_repetition:

        pass

    else:

        #------------------------------#
        #   PLOT DISTRIBUTION OF AUC   #
        #------------------------------#

        methods_without_atc = copy.copy(methods_ordered)
        if 'dcatc' in methods_without_atc:
            methods_without_atc.remove('dcatc')
        all_data = [analysis_results[method] for method in methods_without_atc]
        data_labels = [
            method_to_label[method] for method in methods_without_atc
        ]

        fig = pylab.figure(dpi=300)
        ax = pylab.axes()
        pos = 1
        all_positions = []

        for x in xrange(len(methods_without_atc)):

            # plot violin plot
            print(data_labels[x])
            print(all_data[x])
            parts = ax.violinplot(all_data[x],
                                  positions=[pos],
                                  showmeans=False,
                                  showmedians=True)

            all_positions.append(pos)
            pos += 2

            # Change color of the body
            for pc in parts['bodies']:
                pc.set_facecolor(colors_ordered[x][0])

            # Change color of the segments
            parts['cmedians'].set_color(colors_ordered[x][1])
            parts['cbars'].set_color(colors_ordered[x][1])
            parts['cmins'].set_color(colors_ordered[x][1])
            parts['cmaxes'].set_color(colors_ordered[x][1])

        # adding horizontal grid lines
        ax.yaxis.grid(True)
        ax.set_xticks([y + 1 for y in range(len(all_data))])
        ax.set_ylabel('Distribution of AUC values')
        # add x-tick labels
        plt.setp(ax, xticks=all_positions, xticklabels=data_labels)
        #plt.xticks(rotation=15)
        # Save
        pylab.savefig(plot_name, format=fig_format)
        plt.show()

    #---------------------------------#
    #   PRINT THE RESULTS IN A FILE   #
    #---------------------------------#

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)
    output_file = os.path.join(
        tables_dir, 'general_performance{}{}.txt'.format(atc_str, pca_str))
    with open(output_file, 'w') as output_fd:
        for method, results in sorted(method_to_results.iteritems(),
                                      key=lambda (x, y): y[0],
                                      reverse=True):
            output_fd.write('{}\t{}\t{}\n'.format(method, results[0],
                                                  results[1]))

    #-------------------------------------------------------------------#
    #   TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U   #
    #-------------------------------------------------------------------#

    mannwhitney_file = os.path.join(
        tables_dir,
        'general_performance_mannwhitney{}{}.txt'.format(atc_str, pca_str))
    with open(mannwhitney_file, 'w') as mannwhitney_fd:

        mann_results = {}
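        # Each cell of the table holds the Mann-Whitney U statistic and the p-value
        # comparing the AUC distributions of two methods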

        mannwhitney_fd.write(' ')
        for method in methods_ordered:
            mannwhitney_fd.write('\t{}'.format(method_to_label[method]))
        mannwhitney_fd.write('\n')

        # Perform the comparisons
        for method1 in methods_ordered:
            mann_results.setdefault(method1, {})
            for method2 in methods_ordered:
                if method1 == method2:
                    mann_results[method1][method2] = '-'
                else:
                    method1_dist = analysis_results[method1]
                    method2_dist = analysis_results[method2]
                    stat, pval = scipy.stats.mannwhitneyu(
                        method1_dist, method2_dist)
                    mann_results[method1][method2] = [stat, pval]

        # Write the table of crossings
        for method1 in methods_ordered:
            mannwhitney_fd.write('{}'.format(method_to_label[method1]))
            for method2 in methods_ordered:
                if method1 == method2:
                    mannwhitney_fd.write('\t-')
                else:
                    stat, pval = mann_results[method1][method2]
                    mannwhitney_fd.write('\t{}, {:.2e}'.format(stat, pval))
            mannwhitney_fd.write('\n')

    #-------------------------------------------------------------------------#
    #   PRINT THE MEAN OF PROBABILITIES OF BEING DRUG COMBINATION IN A FILE   #
    #-------------------------------------------------------------------------#

    prob_file = os.path.join(
        tables_dir,
        'general_performance_probabilities{}{}.txt'.format(atc_str, pca_str))
    with open(prob_file, 'w') as prob_fd:
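        # For each method, average every drug pair's predicted probability of being a
        # drug combination across repetitions and write it together with the drugs'
        # first-level ATC codes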
        for method in methods_ordered:
            dc2scoresmean = obtain_drug_combination_scores_mean(
                method_to_probs[method])
            for dc, mean in sorted(dc2scoresmean.iteritems(),
                                   key=lambda (x, y): y,
                                   reverse=True):
                drug_id1, drug_id2 = dc.split('---')
                drug1 = diana_id_to_drugbank[drug_id1].upper()
                drug2 = diana_id_to_drugbank[drug_id2].upper()
                atcs_drug1 = ', '.join(
                    sorted(set([atc[0] for atc in drugbank_to_atcs[drug1]])))
                atcs_drug2 = ', '.join(
                    sorted(set([atc[0] for atc in drugbank_to_atcs[drug2]])))
                prob_fd.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    method, drug1, drug2, atcs_drug1, atcs_drug2, mean))

    #-------------------#
    #   SAVE ALL AUCs   #
    #-------------------#

    auc_file = os.path.join(
        tables_dir,
        'general_performance_aucs{}{}.txt'.format(atc_str, pca_str))
    with open(auc_file, 'w') as auc_fd:
        for method in methods_ordered:
            auc_fd.write('{}\t{}\n'.format(
                method, ','.join([str(x) for x in analysis_results[method]])))

    # fig = pylab.figure(dpi=300)
    # ax = pylab.axes()
    # #pylab.hold(True)
    # pos = 1
    # col_num = 0

    # xticks = [] # Define the places in which the labels will be
    # xlabels = [] # Define the labels (the names of the methods)
    # #colors = [ ['#9ed0ff, blue'], ['#32f232', 'green'], ['#fbc562', '#d48900'], ['#ff7373', '#b80000'], ['grey', 'black'] ]

    # for method in methods_ordered:

    #     positions = []
    #     positions.append(pos) # Define the positions of the boxplots
    #     pos+=2 # Add separation between boxplots
    #     xlabels.append(method_to_label[method]) # Add the method used at the x axis

    #     # Boxplot group
    #     #bp = boxplot(data, positions = positions, widths = 0.6)
    #     bp = pylab.boxplot(analysis_results[method], positions = positions, widths = 0.6, patch_artist=True)

    #     tick = np.mean(positions) # The label will be at the mean of the positions (in the middle)
    #     xticks.append(tick)

    # # Set axes limits and labels
    # pylab.xlim(0,pos-1)
    # pylab.ylim(0,1)
    # ax.set_xticklabels(xlabels)
    # ax.set_xticks(xticks)
    # pylab.xlabel('Type of data')
    # pylab.ylabel('Distribution of AUC values')

    # fig.autofmt_xdate()
    # pylab.savefig(plot_name, format=fig_format)
    # pylab.show()

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return