def get_other_variants(most_damaging, all_var, AB, Gene, Exon, Date):
    ''' Collect the alternative variants of every sample whose most damaging
        variant was flagged as unwanted.

        A sample is selected when identify_unwanted puts "LOW" in the TEST
        column of its most damaging variant. All variants of the selected
        samples are then taken from all_var, minus the variants that are
        themselves marked "LOW".

    Args:
        most_damaging: df holding the most damaging variant per sample
        all_var: df of all called variants for the samples in most_damaging
        AB: allele balance threshold, forwarded to identify_unwanted
        Gene: known false positive gene, forwarded to identify_unwanted
        Exon: known false positive exon, forwarded to identify_unwanted
        Date: alternative spelling of Exon, forwarded to identify_unwanted

    Returns:
        A renamed copy of all_var restricted to the selected samples and
        without "LOW" variants, sorted by 'Sample' and 'Score' (both
        descending). The helper column 'cross' is still present.
    '''
    def _all_missing(row):
        # "Y" only when every field of interest holds the "-" placeholder
        if row['Symbol'] == "-" and row['Exon'] == "-" and row['AB'] == "-":
            return "Y"
        return "N"

    # work on a renamed copy in which missing values read "-"
    flagged = rename.rename_columns(most_damaging.copy().fillna("-"))

    # discard samples with nothing in any of the fields of interest
    flagged['all_nan'] = flagged.apply(_all_missing, axis=1)
    has_data = ~flagged['all_nan'].str.contains("Y")
    flagged = flagged[has_data]

    # AB must be numeric; the "-" placeholders are coerced to NaN
    flagged['AB'] = pd.to_numeric(flagged['AB'], errors='coerce')

    # names of the samples whose most damaging variant is marked "LOW"
    flagged = identify_unwanted(flagged, AB, Gene, Exon, Date)
    is_low = flagged['TEST'].str.contains("LOW", na=False)
    flagged_samples = flagged[is_low]['Sample'].tolist()

    # every variant that belongs to one of those samples
    candidates = rename.rename_columns(all_var.copy())
    candidates['cross'] = candidates['Sample'].isin(flagged_samples)
    candidates = candidates[candidates['cross']]

    # drop the variants that are themselves marked "LOW"
    candidates = identify_unwanted(candidates, AB, Gene, Exon, Date)
    not_low = ~candidates['TEST'].str.contains("LOW", na=False)
    candidates = candidates[not_low]

    # highest score first within each sample
    return candidates.sort_values(['Sample', 'Score'], ascending=False)
def clean_phenotype_data(phenotype):
    ''' Clean the phenotype data.

        Reads the phenotype file, renames its columns, strips unwanted
        characters from the 'Sample' entries and then replaces known
        misspelt sample names with their corrected form.

    Args:
        phenotype: path to phenotype file (read as iso-8859-1)

    Returns:
        The phenotype df with renamed columns and corrected 'Sample' names.
    '''
    # characters removed from anywhere inside a sample name
    unwanted_char = {' ': '', '-': '', '\'': ''}
    # whole-name corrections, applied after the characters above are removed
    # (each key listed once; a repeated key would silently override itself)
    name_corrections = {
        '24SA1565': '21SA1565',
        '24SS1575': '21SS1575',
        '24GC1574': '21GC1574',
        '24DR1571': '21DR1571',
        '24FP1566': '21FP1566',
        '24AS1570': '21AS1570',
        '24KS0915': '24ZS0915',
        '1328': '21RL1328',
        '24DW932': '24DW0932',
        '926': '24GN0926',
        '931': '24SG0931',
        '937': '24CB0937',
        '1327': '21SN1327',
        '1374': '24AS1374',
        'MK3598': 'MK_35_98'
    }
    pdf = pd.read_csv(phenotype, encoding='iso-8859-1')
    pdf_clean = rename.rename_columns(pdf)
    pdf_clean = rename.replace_series_strings(pdf_clean,
                                              'Sample',
                                              unwanted_char,
                                              substring=True)
    pdf_clean = rename.replace_series_strings(pdf_clean,
                                              'Sample',
                                              name_corrections,
                                              substring=False)
    return pdf_clean
def create_new_most_damaging(old_most_dam, all_vars, AB=0.3, Gene="SKI", Exon="1/7", Date="01-Jul"):
    ''' Replace the most damaging variant of each patient whose variant
        does not pass the allele balance threshold or lies within a known
        false positive gene and exon. If the existing most damaging variant
        is the only variant for that patient, it remains the most damaging.

    Args:
        old_most_dam: existing dataframe which details the most damaging variant for each patient
        all_vars: a dataframe which contains all variants associated with the patients detailed in old_most_dam
        AB: allele balance minimum threshold
        Gene: gene in which a known false positive lies
        Exon: exon of said gene in which a known false positive lies
        Date: converting an xlsx to csv can turn exon numbers into dates, i.e. 1/7 becomes 01-Jul. This ensures said exons are still filtered if that happened.

    Returns:
        The renamed old_most_dam df in which the next most damaging variant
        has been put in place of each old most damaging variant that did
        not pass the allele balance threshold or was a known false
        positive. Replacement rows carry "Y" in the 'new_md' column.
    '''
    # get_other_variants returns the alternatives sorted by score (highest
    # first), so keeping the first row per sample keeps the best alternative
    all_alt_vars = get_other_variants(old_most_dam, all_vars, AB, Gene, Exon, Date)
    # explicit copy so that adding a column never writes into a slice
    alt_most_dam = all_alt_vars.drop_duplicates(['Sample']).copy()
    alt_most_dam['new_md'] = "Y"   # mark the replacement variants

    # rename columns so they match with alt_most_dam
    old_most_dam = rename.rename_columns(old_most_dam)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has
    combined = pd.concat([old_most_dam, alt_most_dam])

    # "Y" sorts ahead of the missing 'new_md' of the old rows, so within a
    # sample the replacement comes first and survives drop_duplicates
    combined = combined.sort_values(['Sample', 'new_md'])

    return combined.drop_duplicates('Sample')
# Example #4
# 0
def get_phenotype_columns(p):
    ''' Read a phenotype file and list its column names, leaving out
        the 'Sample' column.

    Args:
        p: path to the phenotype file (read as iso-8859-1)

    Returns:
        list of the renamed column names without 'Sample'

    Raises:
        ValueError: when there is no 'Sample' column after renaming
    '''
    raw = pd.read_csv(p, encoding='iso-8859-1')
    renamed = rename.rename_columns(raw)
    phenotype_columns = [name for name in renamed.columns.values]
    # list.remove raises if 'Sample' is absent
    phenotype_columns.remove('Sample')
    return phenotype_columns
def clean_all_var_df(df, three_categories=True):
    ''' Clean up the all variants data.

        Drops rows marked as duplicate samples, converts
        'age at diagnosis' to numeric, renames columns and entries, adds
        the derived columns, removes blank/water control samples and
        finally applies sf.no_SKI_exon1.

    Args:
        df: all variants df; a 'Dup' column is added to it in place
        three_categories: forwarded to nc.create_new_columns

    Returns:
        The cleaned df.
    '''
    blank_pattern = "Blank|blank|ddH20|dH2O|H2O|BLANK|ddh2o"

    # label every row, then keep the rows not labelled "Duplicate"
    df['Dup'] = df.apply(lambda row: mark_duplicate_samples(row, df), axis=1)
    is_duplicate = df['Dup'].str.contains("Duplicate")
    cleaned = df[~is_duplicate]

    cleaned = conversion.convert2numeric(cleaned, ['age at diagnosis'])

    # NOTE: the original comment says rename_columns is also performed in
    # merge_genotype_phenotype; it is applied here as well
    cleaned = rename.rename_columns(cleaned)
    cleaned = rename.rename_entries(cleaned)
    cleaned = nc.create_new_columns(cleaned, three_categories)

    # remove blank / water control samples
    is_blank = cleaned['Sample'].str.contains(blank_pattern)
    cleaned = cleaned[~is_blank]

    return sf.no_SKI_exon1(cleaned)
def clean_genotype_data(genotype):
    ''' Clean the genotype data and filter for the
        genotype columns of interest.

        Rows selected by PLP2VUS are re-labelled as
        'Uncertain Significance' before the columns are filtered.

    Args:
        genotype: path to genotype file (read as iso-8859-1)

    Returns:
        df holding only the columns listed in genotype_columns

    Raises:
        KeyError: if a column of interest is missing after renaming
    '''
    genotype_columns = [
        'Sample', 'AD', 'AB', 'UID', 'validation', 'Category', 'Score',
        'Symbol', 'HGVS', 'Chrom', 'Pos', 'Ref', 'Alt', 'Consequence', 'HGVSc',
        'HGVSp', 'Exon', 'Intron'
    ]
    gdf = pd.read_csv(genotype, encoding='iso-8859-1')
    gdf_clean = rename.rename_columns(gdf)

    # presumably a boolean row mask - confirm against PLP2VUS
    mask = PLP2VUS(gdf_clean)

    # .loc replaces the .ix indexer, which newer pandas no longer provides
    gdf_clean.loc[mask, 'Category'] = 'Uncertain Significance'
    # NOTE(review): 'New Category' is not in genotype_columns, so this value
    # does not reach the returned df - confirm whether that is intended
    gdf_clean.loc[mask, 'New Category'] = 'VUS'

    gdf_filtered = gdf_clean[genotype_columns]
    return gdf_filtered