# Example no. 1
# 0
def ExtractTestData(read_file, write_file):
    """Write the single-substitution, non-VUS variants of a TSV file to a CSV file.

    The tab-separated ``read_file`` is loaded with its first line as the
    header.  For every row whose second column contains ``'>'`` and whose last
    column, converted to ``int``, is not 3, one line is written to
    ``write_file``: a transcript prefix followed by the second column's value.
    The user is asked interactively whether a ``Pathogenicity`` column
    (``Pathogenic`` for a last-column value above 3, ``Benign`` below 3)
    should be appended.

    Args:
        read_file: Path of the tab-separated input file.
        write_file: Path of the output file; it is created or truncated.

    Raises:
        ValueError: If the last column of a substitution row cannot be
            converted to ``int``.
    """
    df = pd.read_csv(read_file, sep='\t', header=0)

    # Asks the user if they want to add a column for pathogenicity.
    extract_pathogenicity = mf.get_yes_no("Extract the pathogenicity (y/n)? ")
    answer_yes = extract_pathogenicity in ('y', 'Y')

    # The context manager guarantees the file is closed even if a row fails.
    with open(write_file, 'w+') as out_file:
        # Adds the column names.
        out_file.write('#Variant')
        if answer_yes:
            out_file.write(',Pathogenicity')
        out_file.write('\n')

        for _, row in df.iterrows():
            # str() lets empty (NaN) cells be skipped instead of raising a
            # TypeError on the membership test below.
            variant = str(row.values[1])
            if '>' not in variant:
                continue  # Not a single substitution.

            # Only parsed for substitution rows, so unrelated rows with a
            # non-numeric last column no longer abort the whole export.
            significance = int(row.values[-1])
            if significance == 3:
                continue  # Value 3 marks a VUS; those are left out.

            # NOTE(review): this comparison can never be true here because
            # `variant` is known to contain '>'.  Presumably the gene name
            # lives in a different column -- confirm and fix the column index.
            if variant == 'BRCA1':
                out_file.write('NM_007294.3:' + variant)
            else:
                out_file.write('NM_000059.3:' + variant)

            if answer_yes:
                # significance == 3 was excluded above, so it is either > 3 or < 3.
                label = 'Pathogenic' if significance > 3 else 'Benign'
                out_file.write(',' + label)
            out_file.write('\n')
# Example no. 2
# 0
def DataExtraction(read_file, write_file):
    """Export one column (and optionally a second one) of a delimited file.

    The separator of ``read_file`` is determined by ``mf.determine_separator``
    and the file is loaded with its first line as the header.  The columns are
    printed, and the user picks (1-based) the column that holds the variant
    identifiers; the rows are sorted by that column.  Each output line is the
    identifier followed by a comma and, if the user asked for it, the value of
    the chosen pathogenicity column.

    Args:
        read_file: Path of the delimited input file.
        write_file: Path of the output file; it is created or truncated.
    """
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Prints the columns of the file to the console. Gets the name of the column to extract
    # variant identifiers from. Optionally writes the pathogenicity into a second column.
    mf.print_columns_with_index(df)
    col1 = mf.get_int_answer('What is the number of the column to sort by? ')
    coordinate_column = df.columns[col1 - 1]
    # sort_values returns a new frame; without the assignment the sort is lost.
    df = df.sort_values(by=coordinate_column)

    first_question = mf.get_yes_no('Write pathogenicity (y/n)? ')
    write_pathogenicity = first_question in ('y', 'Y')
    pathogenicity_column = None
    if write_pathogenicity:
        second_question = mf.get_int_answer('Pathogenicity\'s column number? ')
        pathogenicity_column = df.columns[second_question - 1]

    # Writes the variant identifier and the pathogenicity to write_file.
    # The context manager guarantees the file is closed even if a row fails.
    with open(write_file, 'w+') as out_file:
        for _, row in df.iterrows():
            # Read from `row` directly: the index yielded by iterrows is a
            # label, not a position, so it must not be fed to .iloc once the
            # frame has been reordered.  str() allows non-text columns.
            out_file.write(str(row[coordinate_column]) + ',')
            if write_pathogenicity:
                out_file.write(str(row[pathogenicity_column]))
            out_file.write('\n')
def CountUniqueElementsColumn(read_file):
    """Print how many distinct values a user-chosen column contains.

    The separator of ``read_file`` is determined by ``mf.determine_separator``
    and the file is loaded into a dataframe with its first line as the header.
    The columns are printed and the user picks one by its 1-based number.  The
    number of unique values is printed and, on request, every value together
    with how often it occurs.

    Args:
        read_file: Path of the delimited input file.
    """
    # Creates a dataframe from read_file.
    sep = mf.determine_separator(read_file)
    df = pd.read_csv(read_file, sep=sep, header=0)

    # Print the columns of read_file to the console. Gets the name of the column to be
    # summarised from the user.
    mf.print_columns_with_index(df)
    column_number = mf.get_int_answer('What column should be accumulated? ')
    column = df[df.columns[column_number - 1]]

    # One pass over the column instead of re-filtering the frame per value.
    # sort=False keeps the values in order of first appearance; dropna=False
    # counts missing values properly (an equality filter always reports 0 for
    # NaN because NaN != NaN).
    counts = column.value_counts(sort=False, dropna=False)
    print('Number of unique items: ' + str(len(counts)))

    show_items = mf.get_yes_no('Show item/counts (y/n)? ')
    if show_items in ('y', 'Y'):
        for item, count in counts.items():
            print(str(item) + ': ' + str(count))