def ExtractPlotScores(classification_file, score_file):
    """Pair predictor scores with curated pathogenicity labels and box-plot them.

    Both files are loaded into dataframes using the separator reported by
    ``mf.determine_separator``. The user is asked which columns hold the
    pathogenicity / the score annotation and which columns identify the
    variant in each file. A score is taken from the text following
    ``'REVEL_score='`` in the chosen score column.

    Only the first scored occurrence of a variant is considered. A variant is
    recorded only when the curated file contains it with one of the labels
    ``Benign``, ``Likely_benign``, ``Pathogenic`` or ``Likely_pathogenic``;
    the "likely" labels are folded into ``Benign`` / ``Pathogenic``. Variants
    that are missing from the curated file or carry another label are skipped,
    so the score and label lists always stay the same length.

    Args:
        classification_file: Path of the curated file with pathogenicity labels.
        score_file: Path of the predictor output file.

    Raises:
        ValueError: If the text after ``'REVEL_score='`` is not a valid float.

    Side effects:
        Prompts on the console, prints how many variants were scored, writes
        the paired data to ``output_file`` when exactly three command-line
        arguments were given, and shows a box plot.
    """
    # Determines the separators for both files.
    sep1 = mf.determine_separator(classification_file)
    sep2 = mf.determine_separator(score_file)

    # Creates dataframes from both files.
    pathogenicity_dataframe = pd.read_csv(classification_file, sep=sep1, header=0)
    score_dataframe = pd.read_csv(score_file, sep=sep2, header=0)

    # Inquires about which columns in the curated file contain the pathogenicity
    # and the variant identifier.
    mf.print_columns_with_index(pathogenicity_dataframe)
    pathogenicity_index = mf.get_int_answer('What column states the pathogenicity? ')
    pathogenicity_column = list(pathogenicity_dataframe.columns)[pathogenicity_index - 1]
    coordinate_index_1 = mf.get_int_answer('What column identifies the variant? ')
    genomic_coordinate_column1 = list(pathogenicity_dataframe.columns)[coordinate_index_1 - 1]

    # Inquires about which columns in the predictor output contain the score
    # annotation and the variant identifier.
    mf.print_columns_with_index(score_dataframe)
    score_index = mf.get_int_answer('What column gives the scores? ')
    score_column = list(score_dataframe.columns)[score_index - 1]
    coordinate_index_2 = mf.get_int_answer('What column identifies the variant? ')
    genomic_coordinate_column2 = list(score_dataframe.columns)[coordinate_index_2 - 1]

    # Maps every curated variant to its first listed pathogenicity, so each
    # lookup below is a dictionary access instead of a dataframe filter.
    label_by_variant = {}
    for variant, label in zip(pathogenicity_dataframe[genomic_coordinate_column1],
                              pathogenicity_dataframe[pathogenicity_column]):
        label_by_variant.setdefault(variant, label)

    # Folds the "likely" classifications into the two plotted groups.
    label_groups = {
        'Benign': 'Benign',
        'Likely_benign': 'Benign',
        'Pathogenic': 'Pathogenic',
        'Likely_pathogenic': 'Pathogenic',
    }

    # Stores each variant's pathogenicity and score, plus the set of variant
    # identifiers that already reported a score.
    scores = {'pathogenicity': [], 'scores': []}
    scored_variants = set()
    phrase = 'REVEL_score='
    for variant, annotation in zip(score_dataframe[genomic_coordinate_column2],
                                   score_dataframe[score_column]):
        # 'after' holds the variant's score, or is empty when none is reported.
        # str() lets missing (non-string) cells fall through as "no score".
        _, _, after = str(annotation).partition(phrase)
        if after == '' or variant in scored_variants:
            continue
        scored_variants.add(variant)
        score = float(after)

        # Skips variants without a usable curated label instead of recording a
        # score that has no matching pathogenicity.
        group = label_groups.get(label_by_variant.get(variant))
        if group is None:
            continue
        scores['scores'].append(score)
        scores['pathogenicity'].append(group)

    # Constructs a dataframe from the dictionary, then creates a boxplot from it.
    df = pd.DataFrame.from_dict(scores)
    if len(sys.argv) == 4:
        # NOTE(review): output_file is not defined inside this function;
        # presumably a module-level name set from the command line - confirm.
        df.to_csv(output_file)
    print('variants scored:', df.shape[0], '/', pathogenicity_dataframe.shape[0])
    df.boxplot(by='pathogenicity')
    plt.show()
def DataExtraction(read_file, write_file):
    """Write variant identifiers (and optionally pathogenicity) to a text file.

    The input is loaded with the separator reported by
    ``mf.determine_separator``. The user picks the column to sort by, which is
    also the column written out as the variant identifier, and may pick a
    second column whose value is written after the comma. Rows are written in
    ascending order of the chosen column, one ``identifier,[pathogenicity]``
    line per row.

    Args:
        read_file: Path of the csv-like file to read.
        write_file: Path of the file to create or overwrite.
    """
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of the file to the console and gets the column to
    # sort by and extract variant identifiers from.
    mf.print_columns_with_index(df)
    col1 = mf.get_int_answer('What is the number of the column to sort by? ')
    coordinate_column = list(df.columns)[col1 - 1]

    # sort_values returns a new dataframe, so the result must be kept for the
    # requested ordering to take effect.
    df = df.sort_values(by=coordinate_column)

    # Optionally writes the pathogenicity into a second column.
    first_question = mf.get_yes_no('Write pathogenicity (y/n)? ')
    write_pathogenicity = first_question in ('y', 'Y')
    if write_pathogenicity:
        second_question = mf.get_int_answer('Pathogenicity\'s column number? ')
        pathogenicity_column = list(df.columns)[second_question - 1]

    # Writes the variant identifier and the pathogenicity to write_file. The
    # with-block closes the file even if a write fails; str() keeps numeric
    # cells from breaking the string concatenation.
    with open(write_file, 'w+') as out:
        if write_pathogenicity:
            out.writelines(
                str(variant) + ',' + str(label) + '\n'
                for variant, label in zip(df[coordinate_column], df[pathogenicity_column])
            )
        else:
            out.writelines(str(variant) + ',\n' for variant in df[coordinate_column])
def RocPlot(data_file):
    """Plot a ROC curve for the scored variants stored in data_file.

    The file is read with the separator reported by ``mf.determine_separator``.
    Class totals are taken from the ``pathogenicity`` column; per-threshold
    classification uses the second column as the label and the third column
    as the score (positions 1 and 2).
    # NOTE(review): presumably the file is the CSV written by
    # ExtractPlotScores (index, pathogenicity, scores) - confirm.

    Thresholds run over 0.00, 0.01, ..., 1.00; a variant is called positive
    when its score is greater than or equal to the threshold.

    Args:
        data_file: Path of the csv-like file with labels and scores.

    Raises:
        ValueError: If the file has no 'Pathogenic' or no 'Benign' rows, since
            the rates would require dividing by zero.
    """
    # Determines the separator type associated with the file format of the
    # data file and reads the file that was actually passed in.
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    # Determines what term will be considered positive and which negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    positives = df[df['pathogenicity'] == positive_term].shape[0]
    negatives = df[df['pathogenicity'] == negative_term].shape[0]
    if positives == 0 or negatives == 0:
        raise ValueError(
            'ROC plot needs at least one ' + positive_term + ' and one '
            + negative_term + ' variant in ' + str(data_file))

    # Column-wise views of the labels and scores, computed once so every
    # threshold is a vectorised comparison rather than a row-by-row scan.
    labels = df.iloc[:, 1]
    scores = df.iloc[:, 2].astype(float)
    is_positive = labels == positive_term
    is_negative = labels == negative_term

    # Creates a partition of the interval [0,1]. x stores the false positive
    # rates and y the true positive rates.
    thresholds = [float(i) / 100 for i in range(101)]
    x = []
    y = []
    for threshold in thresholds:
        called_positive = scores >= threshold
        true_positives = int((called_positive & is_positive).sum())
        false_positives = int((called_positive & is_negative).sum())
        x.append(false_positives / negatives)
        y.append(true_positives / positives)

    # Creates an empty figure, with total area of 1.
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

    # Draws the grid and plots the ROC curve and the baseline curve.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth=3.0)
    ax.plot([0, 1], [0, 1], color='orange', linestyle='--')
    plt.show()
def Optimization(data_file):
    """Plot predictor accuracy against the decision threshold.

    The file is read with the separator reported by ``mf.determine_separator``.
    The number of rows in the ``pathogenicity`` column is the denominator;
    per-threshold classification uses the second column as the label and the
    third column as the score (positions 1 and 2).

    For each threshold in 0.00, 0.01, ..., 1.00 a variant counts as correct
    when it is 'Pathogenic' with score >= threshold or 'Benign' with
    score < threshold. The best accuracy and every threshold reaching it are
    printed, then the accuracy curve is shown.

    Args:
        data_file: Path of the csv-like file with labels and scores.

    Raises:
        ValueError: If the file contains no rows.
    """
    # Creates a dataframe from data_file (the file that was passed in).
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    # Determines what term will be considered positive and which negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    total = df['pathogenicity'].shape[0]
    if total == 0:
        raise ValueError('No variants found in ' + str(data_file))

    # Column-wise views of the labels and scores, computed once so every
    # threshold is a vectorised comparison rather than a row-by-row scan.
    labels = df.iloc[:, 1]
    scores = df.iloc[:, 2].astype(float)
    is_positive = labels == positive_term
    is_negative = labels == negative_term

    # Creates a partition of the interval [0,1]. x stores the thresholds and
    # y the accuracy reached at each of them.
    thresholds = [float(i) / 100 for i in range(101)]
    x = []
    y = []
    for threshold in thresholds:
        true_positives = int(((scores >= threshold) & is_positive).sum())
        true_negatives = int(((scores < threshold) & is_negative).sum())
        x.append(threshold)
        y.append((true_positives + true_negatives) / total)

    # Prints the best accuracy and the threshold(s) at which it is reached.
    best_accuracy = max(y)
    print('Max : ' + str(best_accuracy) + '\nAt : ', end='')
    print(*[x[index] for index, value in enumerate(y) if value == best_accuracy], sep=', ')

    # Creates an empty figure, with total area of 1.
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Threshold')

    # Draws the grid and plots the response curve.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth=3.0)
    plt.show()
def CountUniqueElementsColumn(read_file):
    """Report the distinct values of a user-chosen column and their counts.

    The file is loaded with the separator reported by
    ``mf.determine_separator``. The number of distinct values (a missing
    value counts as one item) is printed, and on request each value is
    printed with the number of rows that contain it, in order of first
    appearance.

    Args:
        read_file: Path of the csv-like file to inspect.
    """
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of read_file to the console and gets the column to
    # accumulate from the user.
    mf.print_columns_with_index(df)
    column_number = mf.get_int_answer('What column should be accumulated? ')
    column_name = list(df.columns)[column_number - 1]
    column = df[column_name]

    # Creates a list of the unique values from the specified column. Prints
    # its length, and optionally each item and its frequency.
    unique_values = pd.unique(column)
    print('Number of unique items: ' + str(len(unique_values)))
    show_items = mf.get_yes_no('Show item/counts (y/n)? ')
    if show_items in ('y', 'Y'):
        # A missing value never compares equal to itself, so it has to be
        # counted with isna() instead of an equality test.
        missing_count = int(column.isna().sum())
        for item in unique_values:
            if pd.isna(item):
                count = missing_count
            else:
                count = int((column == item).sum())
            print(str(item) + ': ' + str(count))
def MoveVariants(take_from, add_to, modified_take_from, modified_add_to):
    """Move randomly chosen pathogenic or benign variants between two files.

    The original files are left untouched; two new csv files are written.
    The first column of each input is treated as the variant identifier and
    the user names the pathogenicity column of each file. 'Pathogenic' and
    'Likely_pathogenic' rows form the pathogenic group, 'Benign' and
    'Likely_benign' rows the benign group. Answering 'p' moves pathogenic
    variants; any other answer moves benign ones.

    ``modified_add_to`` receives the identifier and pathogenicity columns of
    add_to followed by the moved rows (renamed to add_to's column names).
    ``modified_take_from`` receives the untouched group followed by the rows
    of the chosen group that were not moved; rows of take_from with any other
    pathogenicity value are not written.

    Args:
        take_from: The csv-like file to remove variants from.
        add_to: The csv-like file to add variants to.
        modified_take_from: Output path for the reduced version of take_from.
        modified_add_to: Output path for the extended version of add_to.

    Raises:
        ValueError: If the requested number of variants is negative or larger
            than the number available in the chosen group.
    """
    # Creates dataframes from take_from and add_to.
    sep1 = mf.determine_separator(take_from)
    sep2 = mf.determine_separator(add_to)
    df1 = pd.read_csv(take_from, sep=sep1, header=0)
    df2 = pd.read_csv(add_to, sep=sep2, header=0)

    # Gets the name of the column of the variants' pathogenicity in take_from.
    mf.print_columns_with_index(df1)
    df1_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the first file? ')
    df1_column_name = list(df1.columns)[df1_column_number - 1]
    df1_variant_column = list(df1.columns)[0]

    # Gets the name of the column of the variants' pathogenicity in add_to.
    mf.print_columns_with_index(df2)
    df2_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the second file? ')
    df2_column_name = list(df2.columns)[df2_column_number - 1]
    df2_variant_column = list(df2.columns)[0]

    # Gets the number of variants to transfer and of what pathogenicity.
    answer = input('Move pathogenic or benign variants (p/b)? ')
    number_to_move = mf.get_int_answer('Move how many variants? ')

    # Splits take_from into two parts, one containing only pathogenic variants
    # and the other only benign.
    df1_pathogenic = df1[(df1[df1_column_name] == 'Pathogenic')
                         | (df1[df1_column_name] == 'Likely_pathogenic')]
    df1_benign = df1[(df1[df1_column_name] == 'Benign')
                     | (df1[df1_column_name] == 'Likely_benign')]

    # 'donor' is the group variants are drawn from; 'untouched' is kept whole.
    if answer == 'p':
        donor, untouched = df1_pathogenic, df1_benign
    else:
        donor, untouched = df1_benign, df1_pathogenic

    # Randomly selects number_to_move variants from the donor group.
    number_of_variants = donor.shape[0]
    if not 0 <= number_to_move <= number_of_variants:
        raise ValueError(
            'Cannot move ' + str(number_to_move) + ' variants; only '
            + str(number_of_variants) + ' are available.')
    indices = random.sample(range(number_of_variants), number_to_move)
    chosen = set(indices)
    everything_else = [i for i in range(number_of_variants) if i not in chosen]

    # Creates a dataframe of the variants to move. Renames the columns to
    # match add_to's.
    df1_to_move = donor.iloc[indices][[df1_variant_column, df1_column_name]]
    df1_to_move = df1_to_move.rename(columns={
        df1_variant_column: df2_variant_column,
        df1_column_name: df2_column_name,
    })

    # Generates the two modified dataframes and the two new csv files.
    # pd.concat is used because DataFrame.append no longer exists in current
    # pandas releases.
    df_out_2 = pd.concat(
        [df2[[df2_variant_column, df2_column_name]], df1_to_move],
        ignore_index=True)
    df_out_1 = pd.concat([untouched, donor.iloc[everything_else]])
    df_out_1.to_csv(modified_take_from, index=False)
    df_out_2.to_csv(modified_add_to, index=False)
# NOTE(review): this span holds the import header, the doc comment for MoveVariants and a
# truncated duplicate of that function: its `def` line has no trailing colon and the body
# stops after the column lookups. A complete MoveVariants definition appears earlier in
# this file. `pd` is used below but not imported in the visible header - confirm.
import MiscFunctions as mf import random # For pseudo-random numbers import time # To set the seed #------------------------------------------------------------------------------------------------ ## Move a specified number of pathogenic or benign variants from a csv file to another file. # this doesn't modify the original files, rather creates new ones with new data. # @param take_from : The csv-like file to remove variants from. # @param add_to : The csv-like file to add variants to. # @param modified_take_from : The modified version of take_from # @param modified_add_to : The modified version of add_to # def MoveVariants(take_from, add_to, modified_take_from, modified_add_to) # Create dataframes from take_from and add_to. I will refer to the dataframes by their # respective filenames. sep1 = mf.determine_separator(take_from) sep2 = mf.determine_separator(add_to) df1 = pd.read_csv(take_from, sep=sep1, header=0) df2 = pd.read_csv(add_to, sep=sep2, header=0) # Gets the name of the column of the variants' pathogenicity in take_from. mf.print_columns_with_index(df1) df1_column_number = mf.get_int_answer('What column contains the pathogenicity in the first file? ') df1_column_name = list(df1.columns)[df1_column_number - 1] df1_variant_column = list(df1.columns)[0] # Gets the name of the column of the variants' pathogenicity add_to. mf.print_columns_with_index(df2) df2_column_number = mf.get_int_answer('What column contains the pathogenicity in the second file? ') df2_column_name = list(df2.columns)[df2_column_number - 1] df2_variant_column = list(df2.columns)[0]
def Optimization(data_file):
    """Plot predictor accuracy against the threshold and mark the optimum.

    The file is read with the separator reported by ``mf.determine_separator``.
    The threshold step is the smallest gap between consecutive sorted values
    of the ``Score`` column, capped at 1 and never smaller than 0.001. The
    number of rows in the ``Pathogenicity`` column is the denominator;
    per-threshold classification uses the second column as the label and the
    third column as the score (positions 1 and 2).
    # NOTE(review): presumably positions 1 and 2 are the 'Pathogenicity' and
    # 'Score' columns - confirm against the file layout.

    A variant counts as correct when it is 'Pathogenic' with
    score >= threshold or 'Benign' with score < threshold. The row count,
    the resolution, the best accuracy and the mean and median of the
    thresholds reaching it are printed; the curve is shown with a dashed red
    line at the median best threshold.

    Args:
        data_file: Path of the csv-like file with labels and scores.

    Raises:
        ValueError: If the file contains no rows.
    """
    # Creates a dataframe from data_file (the file that was passed in).
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)
    scores = (df['Score'].values).copy()
    scores.sort()

    # Sets the resolution for the plot: the smallest gap between neighbouring
    # scores, bounded below so the number of thresholds stays manageable.
    granularity = 1
    for lower, upper in zip(scores, scores[1:]):
        granularity = min(granularity, upper - lower)
    granularity = max(granularity, .001)

    # Creates a partition of the interval [0,1]. x stores the thresholds and
    # y the accuracy reached at each of them.
    thresholds = [
        float(i) * granularity for i in range(int(1 / granularity) + 1)
    ]
    x = []
    y = []

    # Determines what term will be considered positive and which negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    total = df['Pathogenicity'].shape[0]
    print(total)
    if total == 0:
        raise ValueError('No variants found in ' + str(data_file))

    # Column-wise views of the labels and scores, computed once so every
    # threshold is a vectorised comparison rather than a row-by-row scan.
    labels = df.iloc[:, 1]
    row_scores = df.iloc[:, 2].astype(float)
    is_positive = labels == positive_term
    is_negative = labels == negative_term

    # For each threshold in the partition, determines the accuracy.
    for threshold in thresholds:
        true_positives = int(((row_scores >= threshold) & is_positive).sum())
        true_negatives = int(((row_scores < threshold) & is_negative).sum())
        x.append(threshold)
        y.append((true_positives + true_negatives) / total)

    # Prints the threshold(s) that optimize the accuracy and their averages.
    best_accuracy = max(y)
    max_values = [x[index] for index, value in enumerate(y) if value == best_accuracy]
    median_threshold = stat.median(max_values)
    print('Resolution : ' + str(granularity))
    print('Max : ' + str(best_accuracy))
    print('Mean : ' + str(stat.mean(max_values)))
    print('Median : ' + str(median_threshold))

    # Creates an empty figure, with total area of 1.
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Threshold')

    # Draws the grid, plots the response curve and marks the median optimum.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth=2.0)
    ax.plot(
        [median_threshold, median_threshold],
        [0, best_accuracy],
        color='red',
        linestyle='--',
    )
    plt.title('Accuracy of EA on Training Set as a Function of Threshold')
    plt.show()
def RocPlot(data_file):
    """Plot a ROC curve with its AUC for the scored variants in data_file.

    The file is read with the separator reported by ``mf.determine_separator``.
    The AUC is computed with ``roc_auc_score`` from the ``Score`` column and
    the ``Pathogenicity`` column mapped to 1 ('Pathogenic') and 0 ('Benign').
    The threshold step is the smallest gap between consecutive sorted scores,
    capped at 1 and never smaller than 0.001. Class totals come from the
    ``Pathogenicity`` column; per-threshold classification uses the second
    column as the label and the third column as the score (positions 1 and 2).
    # NOTE(review): presumably positions 1 and 2 are the 'Pathogenicity' and
    # 'Score' columns - confirm against the file layout.

    Args:
        data_file: Path of the csv-like file with labels and scores.

    Raises:
        ValueError: If the file has no 'Pathogenic' or no 'Benign' rows, since
            the rates would require dividing by zero.
    """
    # Determines the separator type associated with the file format of the
    # data file and reads the file that was actually passed in.
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    # Determines what term will be considered positive and which negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    positives = df[df['Pathogenicity'] == positive_term].shape[0]
    negatives = df[df['Pathogenicity'] == negative_term].shape[0]
    if positives == 0 or negatives == 0:
        raise ValueError(
            'ROC plot needs at least one ' + positive_term + ' and one '
            + negative_term + ' variant in ' + str(data_file))

    # Computes the area under the curve from the unsorted scores, which are
    # still aligned with the labels at this point.
    scores = (df['Score'].values).copy()
    binary_classifications = df['Pathogenicity'].replace({
        'Pathogenic': 1,
        'Benign': 0
    })
    AUC = roc_auc_score(list(binary_classifications.values), scores)
    scores.sort()

    # Sets the resolution for the plot: the smallest gap between neighbouring
    # scores, bounded below so the number of thresholds stays manageable.
    granularity = 1
    for lower, upper in zip(scores, scores[1:]):
        granularity = min(granularity, upper - lower)
    granularity = max(granularity, .001)

    # Creates a partition of the interval [0,1]. x stores the false positive
    # rates and y the true positive rates.
    thresholds = [
        float(i) * granularity for i in range(int(1 / granularity) + 1)
    ]
    x = []
    y = []

    # Column-wise views of the labels and scores, computed once so every
    # threshold is a vectorised comparison rather than a row-by-row scan.
    labels = df.iloc[:, 1]
    row_scores = df.iloc[:, 2].astype(float)
    is_positive = labels == positive_term
    is_negative = labels == negative_term

    # For each threshold in the partition, determines the true positive rate
    # and the false positive rate for the data.
    for threshold in thresholds:
        called_positive = row_scores >= threshold
        true_positives = int((called_positive & is_positive).sum())
        false_positives = int((called_positive & is_negative).sum())
        x.append(false_positives / negatives)
        y.append(true_positives / positives)

    # Creates an empty figure, with total area of 1.
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

    # Draws the grid and plots the ROC curve and the baseline curve.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth=3.0)
    ax.plot([0, 1], [0, 1], color='orange', linestyle='--')
    plt.title('ROC Plot of REVEL on Test Set')
    ax.legend(['AUC: ' + str(round(AUC, 3))], loc=4)
    plt.show()