Example #1
# Imports used by the examples below. Example #7 shows part of the original import block; the
# 'statistics' and sklearn imports are assumptions based on how 'stat' and 'roc_auc_score' are
# used in Examples #8 and #9.
import sys
import random                                  # For pseudo-random numbers
import statistics as stat                      # Assumed alias for the 'stat' calls in Example #8
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator  # Tick spacing used in the plotting examples
from sklearn.metrics import roc_auc_score      # AUC computation in Example #9
import MiscFunctions as mf                     # Project helper module (see Example #7)
#------------------------------------------------------------------------------------------------

def ExtractPlotScores(classification_file, score_file):
	# Determines the separators for both files.
	sep1 = mf.determine_separator(classification_file)
	sep2 = mf.determine_separator(score_file)

	# Creates dataframes from both files.
	pathogenicity_dataframe = pd.read_csv(classification_file, sep=sep1, header=0)
	score_dataframe = pd.read_csv(score_file, sep=sep2, header=0)

	# Asks which column of the curated file contains the pathogenicity classification and which
	# column identifies the variant.
	mf.print_columns_with_index(pathogenicity_dataframe)
	pathogenicity_index  = mf.get_int_answer('What column states the pathogenicity? ')
	pathogenicity_column = list(pathogenicity_dataframe.columns)[pathogenicity_index - 1]
	coordinate_index_1  = mf.get_int_answer('What column identifies the variant? ')
	genomic_coordinate_column1 = list(pathogenicity_dataframe.columns)[coordinate_index_1 - 1]

	# Asks which column of the predictor's output file contains the pathogenicity scores and which
	# column identifies the variant; these identifiers must match those in the curated file.
	mf.print_columns_with_index(score_dataframe)
	score_index = mf.get_int_answer('What column gives the scores? ')
	score_column = list(score_dataframe.columns)[score_index - 1]
	coordinate_index_2  = mf.get_int_answer('What column identifies the variant? ')
	genomic_coordinate_column2 = list(score_dataframe.columns)[coordinate_index_2 - 1]

	# Creates a dictionary to store each variant's pathogenicity and pathogenicity score. Creates
	# a list that stores each variant identifier associated with a reported score.
	scores = {'pathogenicity': [], 'scores': []}
	chromosome_list = []

	for index, row in score_dataframe.iterrows(): # Iterates through each variant in the output file.
		# Determines if the given variant in the output file has a score. If so, the variable
		# 'after' contains the variant's score. Otherwise, 'after' is empty.
		to_parse = score_dataframe[score_column].iloc[index]
		phrase = 'REVEL_score='
		before, at, after = to_parse.partition(phrase)

		if ( after != ''): # If the variant reports a score.
			# And the variant has no previously reported score.
			if ( score_dataframe[genomic_coordinate_column2].iloc[index] not in chromosome_list ):
				# Add the variant to the list of scored variants. Then append its score to the list
				# of scores and its pathogenicity to the list of pathogenicities.
				chromosome_list.append(score_dataframe[genomic_coordinate_column2].iloc[index])
				scores['scores'].append(float(after))
				genomic_coordinate = chromosome_list[-1]
				pathogenicity = pathogenicity_dataframe[pathogenicity_dataframe[genomic_coordinate_column1] == genomic_coordinate][pathogenicity_column]
				if ( (pathogenicity.values[0] == 'Benign') | (pathogenicity.values[0] == 'Likely_benign') ):
					scores['pathogenicity'].append("Benign")
				elif ( (pathogenicity.values[0] == 'Pathogenic') | (pathogenicity.values[0] == 'Likely_pathogenic') ):
					scores['pathogenicity'].append('Pathogenic')

	# Constructs a dataframe from the dictionary and creates a boxplot of the scores grouped by
	# pathogenicity. The third command-line argument, if given, is used as an output path for the
	# scored variants.
	df = pd.DataFrame.from_dict(scores)
	if ( len(sys.argv) == 4 ):
		df.to_csv(sys.argv[3])
	print('variants scored:', df.shape[0], '/', pathogenicity_dataframe.shape[0])
	boxplt = df.boxplot(by='pathogenicity')
	plt.show()
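A possible command-line entry point for this example is sketched below; the argument layout (classification file, score file, optional output path) is inferred from the len(sys.argv) == 4 check above, and the script name is hypothetical.

if __name__ == '__main__':
    # Hypothetical usage: python ExtractPlotScores.py <classification_file> <score_file> [<output_file>]
    if len(sys.argv) not in (3, 4):
        sys.exit('usage: python ExtractPlotScores.py <classification_file> <score_file> [<output_file>]')
    ExtractPlotScores(sys.argv[1], sys.argv[2])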
Example #2
def DataExtraction(read_file, write_file):
    # Creates a dataframe from read_file and opens a new file for writing.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)
    write_file = open(write_file, 'w+')

    # Prints the columns of the file to the console. Gets the name of the column to extract
    # variant identifiers from. Optionally writes the pathogenicity into a second column.
    mf.print_columns_with_index(df)
    col1 = mf.get_int_answer('What is the number of the column to sort by? ')
    coordinate_column = list(df.columns)[col1 - 1]
    df = df.sort_values(by=coordinate_column).reset_index(drop=True)  # Re-index so the .iloc lookups below stay aligned.
    first_question = mf.get_yes_no('Write pathogenicity (y/n)? ')
    if ((first_question == 'y') | (first_question == "Y")):
        second_question = mf.get_int_answer('Pathogenicity\'s column number? ')
        pathogenicity_column = list(df.columns)[second_question - 1]

    # Writes the variant identifier and, optionally, the pathogenicity to write_file.
    for index, row in df.iterrows():
        write_file.write(df[coordinate_column].iloc[index] + ',')
        if ((first_question == 'y') | (first_question == "Y")):
            write_file.write(df[pathogenicity_column].iloc[index])
        write_file.write('\n')

    write_file.close()
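For comparison, the explicit write loop above could be replaced by a single pandas call; a minimal sketch, assuming the user answered 'y' (so pathogenicity_column is defined) and using a hypothetical output path instead of the already-opened file handle:

# Write the identifier and pathogenicity columns without a header row or index, matching the loop's output.
df[[coordinate_column, pathogenicity_column]].to_csv('extracted_variants.csv', header=False, index=False)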
Example #3
def RocPlot(data_file):
	# Determines the separator type associated with the file format of the data file.
	sep = mf.determine_separator(data_file)

	# Creates a partition of the interval [0,1]. Creates two empty lists; one will store the true
	# positive rate and the other will store the false positive rates.
	thresholds = [float(i)/100 for i in range(101)]
	x = []
	y = []

	df = pd.read_csv(data_file, sep=sep)

	# Determines what term will be considered positive and which will be negative.
	positive_term = 'Pathogenic'
	negative_term = 'Benign'
	positives = df[df['pathogenicity'] == positive_term].shape[0]
	negatives = df[df['pathogenicity'] == negative_term].shape[0]

	# For each threshold in the partition, determines the true positive rate and the false positive
	# rate for the data.
	for threshold in thresholds:
		true_positives  = 0
		false_positives = 0
		for index, row in df.iterrows():
			score = float(row.values[2])
			if ( (score >= threshold) & (row.values[1] == positive_term) ):
				true_positives += 1
			elif ( (score >= threshold) & (row.values[1] == negative_term) ):
				false_positives += 1

		true_positive_rate = true_positives/positives
		false_positive_rate = false_positives/negatives

		x.append(false_positive_rate)
		y.append(true_positive_rate)

	#------------------------------------------------------------------------------------------------
	# Creates an empty 6x6 figure whose axes span the unit square [0,1] x [0,1].
	figure = plt.figure(figsize=(6, 6))
	ax = figure.add_subplot(1, 1, 1)
	ax.set_xlim(0, 1)
	ax.set_ylim(0, 1)
	ax.xaxis.set_major_locator(MultipleLocator(0.250))
	ax.yaxis.set_major_locator(MultipleLocator(0.250))
	ax.xaxis.set_minor_locator(MultipleLocator(0.125))
	ax.yaxis.set_minor_locator(MultipleLocator(0.125))
	ax.tick_params(which='major', length=10.0)
	ax.tick_params(which='minor', length=5.0)
	ax.set_ylabel('True Positive Rate')
	ax.set_xlabel('False Positive Rate')

	# Draws the grid and plots the ROC curve and the baseline curve.
	ax.grid(linestyle=':', linewidth=0.5, color='black')
	ax.plot(x, y, color='blue', linewidth='3.0')
	ax.plot([0,1], [0,1], color='orange', linestyle='--')
	plt.show()
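If a numeric area-under-the-curve value is wanted from this version of the plot (Example #9 below uses sklearn's roc_auc_score instead), it can be estimated from the collected rate lists with the trapezoidal rule; a minimal numpy sketch, callable from RocPlot before plt.show():

import numpy as np

def trapezoid_auc(fpr, tpr):
    # The rates above are generated in decreasing order of FPR, so sort by FPR before integrating.
    order = np.argsort(fpr)
    return float(np.trapz(np.array(tpr)[order], np.array(fpr)[order]))

# Inside RocPlot:  print('Approximate AUC:', round(trapezoid_auc(x, y), 3))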
Example #4
def Optimization(data_file):
    # Creates a dataframe from data_file.
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    # Creates a partition of the interval [0,1]. Creates two empty lists; one will store the accuracy
    # and the other will store the threshold.
    thresholds = [float(i) / 100 for i in range(101)]
    x = []
    y = []

    # Determines what term will be considered positive and which will be negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    total = df['pathogenicity'].shape[0]

    # For each threshold in the partition, determines the accuracy of the predictor.
    for threshold in thresholds:
        true_positives = 0
        true_negatives = 0
        for index, row in df.iterrows():
            score = float(row.values[2])
            if ((score >= threshold) & (row.values[1] == positive_term)):
                true_positives += 1
            elif ((score < threshold) & (row.values[1] == negative_term)):
                true_negatives += 1
        accuracy = (true_positives + true_negatives) / total
        x.append(threshold)
        y.append(accuracy)

    print('Max : ' + str(max(y)) + '\nAt  : ', end='')
    print(*[x[index] for index, value in enumerate(y) if value == max(y)],
          sep=', ')

    #------------------------------------------------------------------------------------------------
    # Creates an empty 6x6 figure whose axes span the unit square [0,1] x [0,1].
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Threshold')

    # Draws the grid and plots the accuracy curve.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth='3.0')
    plt.show()
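The nested loop above rescans the dataframe once per threshold. A vectorized alternative is sketched below; it assumes the column names 'scores' and 'pathogenicity' written by Example #1 and that every row is labelled either Pathogenic or Benign, and it would replace the loop inside Optimization:

import numpy as np

# A prediction is correct when (score >= threshold) agrees with the Pathogenic label.
score_values = df['scores'].to_numpy(dtype=float)
is_positive = (df['pathogenicity'] == positive_term).to_numpy()
x = list(thresholds)
y = [float(np.mean((score_values >= t) == is_positive)) for t in thresholds]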
Example #5
def CountUniqueElementsColumn(read_file):
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of read_file to the console. Gets, from the user, the column whose unique
    # values should be counted.
    mf.print_columns_with_index(df)
    column_number = mf.get_int_answer('What column should be accumulated? ')
    column_name = list(df.columns)[column_number - 1]

    # Creates a list of the unique values from the specified column. Prints its length, and
    # optionally each item and its frequency.
    unique_values = pd.unique(df[column_name])
    print('Number of unique items: ' + str(len(unique_values)))
    show_items = mf.get_yes_no('Show item/counts (y/n)? ')
    if ((show_items == 'y') | (show_items == 'Y')):
        for item in unique_values:
            print(str(item) + ': ' + str(df[df[column_name] == item].shape[0]))
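For reference, pandas' built-in value_counts produces the same unique-item counts in a single call; a minimal sketch that could stand in for the loop above (items are ordered by frequency rather than by first appearance):

counts = df[column_name].value_counts(dropna=False)
print('Number of unique items: ' + str(counts.shape[0]))
print(counts.to_string())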
Example #6
def MoveVariants(take_from, add_to, modified_take_from, modified_add_to):
    # Create dataframes from take_from and add_to. I will refer to the dataframes by their
    # respective filenames.
    sep1 = mf.determine_separator(take_from)
    sep2 = mf.determine_separator(add_to)
    df1 = pd.read_csv(take_from, sep=sep1, header=0)
    df2 = pd.read_csv(add_to, sep=sep2, header=0)

    # Gets the name of the column of the variants' pathogenicity in take_from.
    mf.print_columns_with_index(df1)
    df1_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the first file? ')
    df1_column_name = list(df1.columns)[df1_column_number - 1]
    df1_variant_column = list(df1.columns)[0]

    # Gets the name of the column of the variants' pathogenicity in add_to.
    mf.print_columns_with_index(df2)
    df2_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the second file? ')
    df2_column_name = list(df2.columns)[df2_column_number - 1]
    df2_variant_column = list(df2.columns)[0]

    # Gets the number of variants to transfer and of what pathogenicity.
    answer = input('Move pathogenic or benign variants (p/b)? ')
    number_to_move = mf.get_int_answer('Move how many variants? ')

    # Splits take_from into two parts, one containing only pathogenic variants and the other only benign.
    df1_pathogenic = df1[(df1[df1_column_name] == 'Pathogenic') |
                         (df1[df1_column_name] == 'Likely_pathogenic')]
    df1_benign = df1[(df1[df1_column_name] == 'Benign') |
                     (df1[df1_column_name] == 'Likely_benign')]

    if (answer == 'p'):  # If the user wants to move pathogenic variants.
        # Randomly selects number_to_move pathogenic variants from take_from's pathogenic half.
        number_of_variants = df1_pathogenic.shape[0]
        indices = random.sample(range(number_of_variants), number_to_move)
        everything_else = [
            i for i in range(number_of_variants) if i not in indices
        ]

        # Creates a dataframe of the pathogenic variants to move. Renames the columns to match add_to's.
        df1_to_move = df1_pathogenic.iloc[indices][[
            df1_variant_column, df1_column_name
        ]]
        df1_to_move = df1_to_move.rename(index=str,
                                         columns={
                                             df1_variant_column:
                                             df2_variant_column,
                                             df1_column_name: df2_column_name
                                         })

        # Builds the two modified dataframes and writes the two new csv files. pd.concat is used
        # because DataFrame.append was removed in pandas 2.0.
        df_out_2 = pd.concat([df2[[df2_variant_column, df2_column_name]], df1_to_move],
                             ignore_index=True)
        df_out_1 = pd.concat([df1_benign, df1_pathogenic.iloc[everything_else]])
        df_out_1.to_csv(modified_take_from, index=False)
        df_out_2.to_csv(modified_add_to, index=False)
    else:  # If the user wants to move benign variants.
        # Randomly selects number_to_move benign variants from take_from's benign half.
        number_of_variants = df1_benign.shape[0]
        indices = random.sample(range(number_of_variants), number_to_move)
        everything_else = [
            i for i in range(number_of_variants) if i not in indices
        ]

        # Creates a dataframe of the benign variants to move. Renames the columns to match add_to's.
        df1_to_move = df1_benign.iloc[indices][[
            df1_variant_column, df1_column_name
        ]]
        df1_to_move = df1_to_move.rename(index=str,
                                         columns={
                                             df1_variant_column:
                                             df2_variant_column,
                                             df1_column_name: df2_column_name
                                         })

        # Builds the two modified dataframes and writes the two new csv files. pd.concat is used
        # because DataFrame.append was removed in pandas 2.0.
        df_out_2 = pd.concat([df2[[df2_variant_column, df2_column_name]], df1_to_move],
                             ignore_index=True)
        df_out_1 = pd.concat([df1_pathogenic, df1_benign.iloc[everything_else]])
        df_out_1.to_csv(modified_take_from, index=False)
        df_out_2.to_csv(modified_add_to, index=False)
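A hypothetical call, with made-up file names, might look as follows; the function then prompts interactively for the pathogenicity columns, the class of variant to move, and how many to move:

# Move variants from a training file into a test file (file names are hypothetical).
MoveVariants('training_variants.csv', 'test_variants.csv',
             'training_variants_reduced.csv', 'test_variants_expanded.csv')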
Example #7
import pandas as pd
import MiscFunctions as mf
import random # For pseudo-random numbers
import time # To set the seed
#------------------------------------------------------------------------------------------------

## Moves a specified number of pathogenic or benign variants from one csv-like file to another.
#      This does not modify the original files; instead it creates new ones with the updated data.
#  @param take_from          : The csv-like file to remove variants from.
#  @param add_to             : The csv-like file to add variants to.
#  @param modified_take_from : The modified version of take_from
#  @param modified_add_to    : The modified version of add_to
#
def MoveVariants(take_from, add_to, modified_take_from, modified_add_to):
	# Create dataframes from take_from and add_to. I will refer to the dataframes by their
	# respective filenames.
	sep1 = mf.determine_separator(take_from)
	sep2 = mf.determine_separator(add_to)
	df1 = pd.read_csv(take_from, sep=sep1, header=0)
	df2 = pd.read_csv(add_to, sep=sep2, header=0)

	# Gets the name of the column of the variants' pathogenicity in take_from.
	mf.print_columns_with_index(df1)
	df1_column_number = mf.get_int_answer('What column contains the pathogenicity in the first file? ')
	df1_column_name = list(df1.columns)[df1_column_number - 1]
	df1_variant_column = list(df1.columns)[0]

	# Gets the name of the column of the variants' pathogenicity in add_to.
	mf.print_columns_with_index(df2)
	df2_column_number = mf.get_int_answer('What column contains the pathogenicity in the second file? ')
	df2_column_name = list(df2.columns)[df2_column_number - 1]
	df2_variant_column = list(df2.columns)[0]
Example #8
def Optimization(data_file):
    # Creates a dataframe from data_file.
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    scores = (df['Score'].values).copy()
    scores.sort()

    # Sets the resolution for the plot
    granularity = 1
    for index in range(len(scores) - 1):
        difference = scores[index + 1] - scores[index]
        if (difference < granularity):
            granularity = difference
    if (granularity < .001):
        granularity = .001

    # Creates a partition of the interval [0,1]. Creates two empty lists; one will store the accuracy
    # and the other will store the threshold.
    thresholds = [
        float(i) * granularity for i in range(int(1 / granularity) + 1)
    ]
    x = []
    y = []

    # Determines what term will be considered positive and which will be negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    total = df['Pathogenicity'].shape[0]
    print(total)

    # For each threshold in the partition, determines the accuracy of the predictor.
    for threshold in thresholds:
        true_positives = 0
        true_negatives = 0
        for index, row in df.iterrows():
            score = float(row.values[2])
            if ((score >= threshold) & (row.values[1] == positive_term)):
                true_positives += 1
            elif ((score < threshold) & (row.values[1] == negative_term)):
                true_negatives += 1
        accuracy = (true_positives + true_negatives) / total
        x.append(threshold)
        y.append(accuracy)

    # Finds the threshold(s) that maximize the accuracy, then prints the resolution, the maximum
    # accuracy, and the mean and median of those thresholds.
    max_values = [x[index] for index, value in enumerate(y) if value == max(y)]
    print('Resolution : ' + str(granularity))
    print('Max        : ' + str(max(y)))
    print('Mean       : ' + str(stat.mean(max_values)))
    print('Median     : ' + str(stat.median(max_values)))

    #------------------------------------------------------------------------------------------------
    # Creates an empty 6x6 figure whose axes span the unit square [0,1] x [0,1].
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Threshold')

    # Draws the grid, plots the accuracy curve, and marks the median optimal threshold with a dashed line.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth='2.0')
    ax.plot(
        [stat.median(max_values),
         stat.median(max_values)],
        [0, max(y)],
        color='red',
        linestyle='--',
    )

    plt.title('Accuracy of EA on Training Set as a Function of Threshold')
    plt.show()
Example #9
def RocPlot(data_file):
    # Determines the separator type associated with the file format of the data file.
    sep = mf.determine_separator(data_file)
    df = pd.read_csv(data_file, sep=sep)

    scores = (df['Score'].values).copy()
    binary_classifications = df['Pathogenicity'].replace({
        'Pathogenic': 1,
        'Benign': 0
    })
    AUC = roc_auc_score(list(binary_classifications.values), scores)
    scores.sort()

    # Sets the resolution for the plot
    granularity = 1
    for index in range(len(scores) - 1):
        difference = scores[index + 1] - scores[index]
        if (difference < granularity):
            granularity = difference
    if (granularity < .001):
        granularity = .001

    # Creates a partition of the interval [0,1]. Creates two empty lists; one will store the true
    # positive rate and the other will store the false positive rates.
    thresholds = [
        float(i) * granularity for i in range(int(1 / granularity) + 1)
    ]
    x = []
    y = []

    # Determines what term will be considered positive and which will be negative.
    positive_term = 'Pathogenic'
    negative_term = 'Benign'
    positives = df[df['Pathogenicity'] == positive_term].shape[0]
    negatives = df[df['Pathogenicity'] == negative_term].shape[0]

    # For each threshold in the partition, determines the true positive rate and the false positive
    # rate for the data.
    for threshold in thresholds:
        true_positives = 0
        false_positives = 0
        for index, row in df.iterrows():
            score = float(row.values[2])
            if ((score >= threshold) & (row.values[1] == positive_term)):
                true_positives += 1
            elif ((score >= threshold) & (row.values[1] == negative_term)):
                false_positives += 1

        true_positive_rate = true_positives / positives
        false_positive_rate = false_positives / negatives

        x.append(false_positive_rate)
        y.append(true_positive_rate)

    #------------------------------------------------------------------------------------------------
    # Creates an empty 6x6 figure whose axes span the unit square [0,1] x [0,1].
    figure = plt.figure(figsize=(6, 6))
    ax = figure.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.xaxis.set_major_locator(MultipleLocator(0.250))
    ax.yaxis.set_major_locator(MultipleLocator(0.250))
    ax.xaxis.set_minor_locator(MultipleLocator(0.125))
    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
    ax.tick_params(which='major', length=10.0)
    ax.tick_params(which='minor', length=5.0)
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

    # Draws the grid and plots the ROC curve and the baseline curve.
    ax.grid(linestyle=':', linewidth=0.5, color='black')
    ax.plot(x, y, color='blue', linewidth='3.0')
    ax.plot([0, 1], [0, 1], color='orange', linestyle='--')
    plt.title('ROC Plot of REVEL on Test Set')
    ax.legend(['AUC: ' + str(round(AUC, 3))], loc=4)
    plt.show()
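Since this example already imports roc_auc_score from sklearn.metrics, the manual threshold sweep could also be replaced by sklearn's roc_curve; a minimal sketch that uses the unsorted scores taken directly from the dataframe (the local scores array was sorted earlier for the granularity calculation):

from sklearn.metrics import roc_curve

# False/true positive rates at every distinct score threshold, computed inside RocPlot.
fpr, tpr, roc_thresholds = roc_curve(binary_classifications.values, df['Score'].values)
# These arrays could then be plotted in place of x and y:  ax.plot(fpr, tpr, color='blue', linewidth=3.0)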