Esempio n. 1
0
def ExtractPlotScores(classification_file, score_file):
    """Box-plot REVEL scores of variants, grouped by curated pathogenicity.

    Reads a curated classification file and a predictor output file, asks
    the user (via the ``mf`` helpers) which columns hold the pathogenicity,
    the score text and the variant identifiers, then pairs each scored
    variant with its curated label and shows a box plot of score by label.

    Only the first scored row of each variant is used. A variant is kept
    only when its curated label is one of ``Benign``, ``Likely_benign``,
    ``Pathogenic`` or ``Likely_pathogenic``; the ``Likely_*`` labels are
    folded into their parent class. Variants that are absent from the
    curated file, or that carry any other label, are skipped so the score
    and pathogenicity lists always stay the same length.

    @param classification_file : The csv-like curated file with pathogenicity labels.
    @param score_file          : The csv-like predictor output containing 'REVEL_score=' text.
    """
    # Determines the separators for both files.
    sep1 = mf.determine_separator(classification_file)
    sep2 = mf.determine_separator(score_file)

    # Creates dataframes from both files.
    pathogenicity_dataframe = pd.read_csv(classification_file, sep=sep1, header=0)
    score_dataframe = pd.read_csv(score_file, sep=sep2, header=0)

    # Asks which columns of the curated file hold the pathogenicity and the
    # variant identifier. Answers are 1-based column numbers.
    mf.print_columns_with_index(pathogenicity_dataframe)
    pathogenicity_index = mf.get_int_answer('What column states the pathogenicity? ')
    pathogenicity_column = pathogenicity_dataframe.columns[pathogenicity_index - 1]
    coordinate_index_1 = mf.get_int_answer('What column identifies the variant? ')
    genomic_coordinate_column1 = pathogenicity_dataframe.columns[coordinate_index_1 - 1]

    # Asks which columns of the predictor output hold the score text and the
    # variant identifier. The identifier must match a value in the curated
    # file's identifier column for the variant to be plotted.
    mf.print_columns_with_index(score_dataframe)
    score_index = mf.get_int_answer('What column gives the scores? ')
    score_column = score_dataframe.columns[score_index - 1]
    coordinate_index_2 = mf.get_int_answer('What column identifies the variant? ')
    genomic_coordinate_column2 = score_dataframe.columns[coordinate_index_2 - 1]

    # Maps every curated variant to the label of its FIRST row, so each lookup
    # below is O(1) instead of a scan of the whole curated dataframe.
    curated_labels = {}
    for identifier, label in zip(pathogenicity_dataframe[genomic_coordinate_column1],
                                 pathogenicity_dataframe[pathogenicity_column]):
        curated_labels.setdefault(identifier, label)

    # Folds the four recognised labels into the two plotted classes.
    label_groups = {
        'Benign': 'Benign',
        'Likely_benign': 'Benign',
        'Pathogenic': 'Pathogenic',
        'Likely_pathogenic': 'Pathogenic',
    }

    # Stores each kept variant's pathogenicity and score in lock-step.
    scores = {'pathogenicity': [], 'scores': []}
    phrase = 'REVEL_score='
    seen_variants = set()  # Variants that already reported a score.

    for to_parse, genomic_coordinate in zip(score_dataframe[score_column],
                                            score_dataframe[genomic_coordinate_column2]):
        # Empty cells are read as NaN (a float); they cannot hold a score.
        if not isinstance(to_parse, str):
            continue

        # 'after' holds the text following the phrase, or '' when the row has no score.
        _, _, after = to_parse.partition(phrase)
        if after == '' or genomic_coordinate in seen_variants:
            continue
        seen_variants.add(genomic_coordinate)

        # Skips variants that are missing from the curated file or carry an
        # unrecognised label, instead of recording a score with no label.
        pathogenicity = label_groups.get(curated_labels.get(genomic_coordinate))
        if pathogenicity is None:
            continue

        scores['scores'].append(float(after))
        scores['pathogenicity'].append(pathogenicity)

    # Constructs a dataframe from the dictionary and creates a boxplot from the data.
    df = pd.DataFrame.from_dict(scores)
    if len(sys.argv) == 4:
        # NOTE(review): output_file is not defined in this function; presumably
        # a module-level name taken from the command line -- confirm.
        df.to_csv(output_file)
    print('variants scored:', df.shape[0], '/', pathogenicity_dataframe.shape[0])
    df.boxplot(by='pathogenicity')
    plt.show()
Esempio n. 2
0
def DataExtraction(read_file, write_file):
    """Write one column of read_file (optionally two) to write_file, sorted.

    Prompts the user (via the ``mf`` helpers) for the 1-based number of the
    column holding the variant identifier; rows are sorted by that column.
    Optionally prompts for a pathogenicity column as well. Each output line
    is ``<identifier>,<pathogenicity>`` or, when no pathogenicity was
    requested, ``<identifier>,``.

    @param read_file  : The csv-like file to read from.
    @param write_file : Path of the file to create (overwritten if present).
    """
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of the file to the console and gets the column to
    # extract variant identifiers from.
    mf.print_columns_with_index(df)
    col1 = mf.get_int_answer('What is the number of the column to sort by? ')
    coordinate_column = df.columns[col1 - 1]
    # sort_values returns a new dataframe; the result has to be kept or the
    # sort has no effect.
    df = df.sort_values(by=coordinate_column)

    # Optionally writes the pathogenicity into a second column.
    first_question = mf.get_yes_no('Write pathogenicity (y/n)? ')
    write_pathogenicity = first_question in ('y', 'Y')
    if write_pathogenicity:
        second_question = mf.get_int_answer('Pathogenicity\'s column number? ')
        pathogenicity_column = df.columns[second_question - 1]

    # Builds the output lines. Values are formatted as text so numeric or
    # missing cells do not break the string concatenation.
    identifiers = df[coordinate_column]
    if write_pathogenicity:
        lines = (
            f'{identifier},{pathogenicity}\n'
            for identifier, pathogenicity in zip(identifiers, df[pathogenicity_column])
        )
    else:
        lines = (f'{identifier},\n' for identifier in identifiers)

    # The context manager closes the file even if writing fails part-way.
    with open(write_file, 'w') as output:
        output.writelines(lines)
def CountUniqueElementsColumn(read_file):
    """Print how many distinct values a user-chosen column of read_file has.

    Prompts (via the ``mf`` helpers) for the 1-based column number, prints
    the number of unique values, and optionally prints every value with its
    frequency in order of first appearance. Missing values count as one
    distinct item and are tallied correctly.

    @param read_file : The csv-like file to inspect.
    """
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of read_file to the console and gets the column to
    # be accumulated from the user.
    mf.print_columns_with_index(df)
    column_number = mf.get_int_answer('What column should be accumulated? ')
    column_name = df.columns[column_number - 1]
    column = df[column_name]

    # Lists the unique values of the column. Prints its length, and
    # optionally each item and its frequency.
    unique_values = pd.unique(column)
    print('Number of unique items: ' + str(len(unique_values)))
    show_items = mf.get_yes_no('Show item/counts (y/n)? ')
    if show_items in ('y', 'Y'):
        for item in unique_values:
            # NaN never compares equal to itself, so an equality test would
            # always report 0 for missing values; count those with isna().
            matches = column.isna() if pd.isna(item) else column == item
            print(str(item) + ': ' + str(int(matches.sum())))
Esempio n. 4
0
def MoveVariants(take_from, add_to, modified_take_from, modified_add_to):
    """Move randomly chosen variants of one class from one file to another.

    Prompts the user (via the ``mf`` helpers and ``input``) for the
    pathogenicity column of each file, the class to move and how many
    variants to move. The first column of each file is used as the variant
    identifier. The chosen rows are removed from take_from and appended,
    reduced to the identifier and pathogenicity columns, to add_to.

    Rows of take_from whose label is none of ``Pathogenic``,
    ``Likely_pathogenic``, ``Benign`` or ``Likely_benign`` are not written
    to modified_take_from. Any answer other than ``p`` moves benign variants.

    @param take_from          : The csv-like file to remove variants from.
    @param add_to             : The csv-like file to add variants to.
    @param modified_take_from : The modified version of take_from
    @param modified_add_to    : The modified version of add_to
    @raises ValueError        : If the requested count is negative or exceeds
                                the number of variants of the chosen class.
    """
    # Create dataframes from take_from and add_to. I will refer to the
    # dataframes by their respective filenames.
    sep1 = mf.determine_separator(take_from)
    sep2 = mf.determine_separator(add_to)
    df1 = pd.read_csv(take_from, sep=sep1, header=0)
    df2 = pd.read_csv(add_to, sep=sep2, header=0)

    # Gets the name of the column of the variants' pathogenicity in take_from.
    mf.print_columns_with_index(df1)
    df1_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the first file? ')
    df1_column_name = df1.columns[df1_column_number - 1]
    df1_variant_column = df1.columns[0]

    # Gets the name of the column of the variants' pathogenicity in add_to.
    mf.print_columns_with_index(df2)
    df2_column_number = mf.get_int_answer(
        'What column contains the pathogenicity in the second file? ')
    df2_column_name = df2.columns[df2_column_number - 1]
    df2_variant_column = df2.columns[0]

    # Gets the number of variants to transfer and of what pathogenicity.
    answer = input('Move pathogenic or benign variants (p/b)? ')
    number_to_move = mf.get_int_answer('Move how many variants? ')

    # Splits take_from into two parts, one containing only pathogenic
    # variants and the other only benign.
    df1_pathogenic = df1[(df1[df1_column_name] == 'Pathogenic') |
                         (df1[df1_column_name] == 'Likely_pathogenic')]
    df1_benign = df1[(df1[df1_column_name] == 'Benign') |
                     (df1[df1_column_name] == 'Likely_benign')]

    # Picks the half to draw from; the other half is kept untouched and is
    # written first in the modified take_from file.
    if answer == 'p':
        source_half, kept_half = df1_pathogenic, df1_benign
    else:
        source_half, kept_half = df1_benign, df1_pathogenic

    # Randomly selects number_to_move row positions from the chosen half.
    number_of_variants = source_half.shape[0]
    if not 0 <= number_to_move <= number_of_variants:
        raise ValueError(
            'Cannot move ' + str(number_to_move) + ' variants: only ' +
            str(number_of_variants) + ' are available.')
    indices = random.sample(range(number_of_variants), number_to_move)
    selected = set(indices)  # Set gives O(1) membership tests below.
    everything_else = [
        i for i in range(number_of_variants) if i not in selected
    ]

    # Creates a dataframe of the variants to move. Renames the columns to
    # match add_to's so the rows line up when concatenated.
    df1_to_move = source_half.iloc[indices][[
        df1_variant_column, df1_column_name
    ]]
    df1_to_move = df1_to_move.rename(columns={
        df1_variant_column: df2_variant_column,
        df1_column_name: df2_column_name,
    })

    # Generates the two modified dataframes and the two new csv files.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_out_2 = pd.concat(
        [df2[[df2_variant_column, df2_column_name]], df1_to_move],
        ignore_index=True)
    df_out_1 = pd.concat([kept_half, source_half.iloc[everything_else]])
    df_out_1.to_csv(modified_take_from, index=False)
    df_out_2.to_csv(modified_add_to, index=False)
Esempio n. 5
0
#  @param take_from          : The csv-like file to remove variants from.
#  @param add_to             : The csv-like file to add variants to.
#  @param modified_take_from : The modified version of take_from
#  @param modified_add_to    : The modified version of add_to
#
def MoveVariants(take_from, add_to, modified_take_from, modified_add_to)
	# Create dataframes from take_from and add_to. I will refer to the dataframes by their
	# respective filenames.
	sep1 = mf.determine_separator(take_from)
	sep2 = mf.determine_separator(add_to)
	df1 = pd.read_csv(take_from, sep=sep1, header=0)
	df2 = pd.read_csv(add_to, sep=sep2, header=0)

	# Gets the name of the column of the variants' pathogenicity in take_from.
	mf.print_columns_with_index(df1)
	df1_column_number = mf.get_int_answer('What column contains the pathogenicity in the first file? ')
	df1_column_name = list(df1.columns)[df1_column_number - 1]
	df1_variant_column = list(df1.columns)[0]

	# Gets the name of the column of the variants' pathogenicity add_to.
	mf.print_columns_with_index(df2)
	df2_column_number = mf.get_int_answer('What column contains the pathogenicity in the second file? ')
	df2_column_name = list(df2.columns)[df2_column_number - 1]
	df2_variant_column = list(df2.columns)[0]

	# Gets the number of variants to transfer and of what pathogenicity.
	answer = input('Move pathogenic or benign variants (p/b)? ')
	number_to_move = mf.get_int_answer('Move how many variants? ')

	# Splits take_from into two parts, one containing only pathogenic variants and the other only benign.
	df1_pathogenic = df1[ (df1[df1_column_name] == 'Pathogenic') | (df1[df1_column_name] == 'Likely_pathogenic') ]