Ejemplo n.º 1
0
def parse_roguev(input_emap, output, verbose, map_from, map_to):
    """Load a tab-separated E-MAP matrix, report on it and save it as GIData.

    Row and column labels are upper-cased and, when a mapping is supplied,
    passed through ``map_names``. The normalized labels are the ones that are
    validated AND written to the output.

    Args:
        input_emap: Path of the tab-separated matrix; the first row and the
            first column are read as labels.
        output: Destination handed to ``GIData.save``.
        verbose: When truthy, also list the row-only and column-only genes.
        map_from: Gene names to replace.
        map_to: Replacement names, paired with ``map_from`` by position.

    Raises:
        AssertionError: If ``map_from`` and ``map_to`` differ in length, or if
            the normalized row or column labels contain duplicates.
    """
    df = pd.read_csv(input_emap, sep='\t', header=0, index_col=0)
    df = df.astype(float)

    print('* Loading from: ', input_emap)
    rows = df.index.str.upper().values
    cols = df.columns.str.upper().values

    # check prefixes
    print('\t- genes in cols with prefix in rows')
    print(has_prefix(rows, cols))
    print('\t- genes in rows with predix in cols')
    print(has_prefix(cols, rows))

    assert len(map_from) == len(map_to)

    if len(map_from) > 0:
        map_dict = dict(zip(map_from, map_to))
        print('\t- Mapping (fixing) row genes')
        rows = map_names(map_dict, rows)
        print('\t- Mapping (fixing) col genes')
        cols = map_names(map_dict, cols)

    # Element-wise comparison is only attempted when the lengths agree.
    is_sym = len(rows) == len(cols) and all(rows == cols)
    print('\t- Data is symmetric?', is_sym)

    rowset = set(rows)
    colset = set(cols)
    row_only = rowset - colset
    col_only = colset - rowset
    shared = rowset & colset
    print('\t- GIs shape:', df.shape)
    print('\t- rows : both : cols')
    print('\t- {} : {} : {}'.format(len(row_only), len(shared),
                                    len(col_only)))
    if verbose:
        print('\t\t- row only genes:', sorted(row_only))
        print('\t\t- col only genes:', sorted(col_only))

    rows_unique = len(rows) == len(rowset)
    cols_unique = len(cols) == len(colset)
    print('\t- Sparsity:', sparsity(df.values))
    print('\t- Rows have no duplicates?', rows_unique)
    print('\t- Cols have no duplicates?', cols_unique)

    assert rows_unique, 'rows have duplicates'
    assert cols_unique, 'cols have duplicates'

    # Write the normalized (upper-cased / mapped) labels back onto the frame.
    # Previously the raw labels were saved, so the name fixing and the
    # duplicate/symmetry checks above had no effect on the output.
    df.index = rows
    df.columns = cols

    gi_data = GIData(values=df.values.astype(float),
                     rows=df.index.values.astype(str),
                     cols=df.columns.values.astype(str),
                     check_symmetric=is_sym)

    print('* Saving to:', output)
    gi_data.save(output)
Ejemplo n.º 2
0
def parse_roguev(input_emap, output):
    """Extract the deletion-only GI matrix from a raw E-MAP table and save it.

    Labels in the raw file look like:
        - ``SPBC32F12.02(rec14SKI8)`` for deleted genes
        - ``SPBC32F12.02(rec14SKI8) - DAMP`` for DaMPed genes

    The raw table must carry identical row and column labels (asserted
    below). One gene in the raw data is present both as a DaMP and as a
    deletion strain; only entries labelled DELETION are kept, so the DaMPed
    strain is dropped.

    Args:
        input_emap: Path of the tab-separated raw matrix.
        output: Destination handed to ``GIData.save``.
    """
    # Load the initial GI matrix as floats.
    raw = pd.read_csv(input_emap, sep='\t', header=0,
                      index_col=0).astype(float)

    # Report what was loaded.
    print('Loaded initial %s x %s E-MAP' % raw.shape)
    print('* Raw data:')
    print(raw.head())
    print('\t- shape:', raw.shape)

    assert np.all(raw.index.values == raw.columns.values)

    # One DAMP/DELETION label per strain, decided by the label suffix.
    labels = np.full(len(raw), 'DELETION')
    labels[raw.index.str.endswith('DAMP')] = 'DAMP'

    orfs = raw.index.map(get_orf).str.upper()
    label_frame = pd.DataFrame({'Genes': orfs, 'Mutation': labels})
    gene_mutation = pd.MultiIndex.from_frame(label_frame,
                                             names=['Gene', 'Mutation'])
    raw.columns = gene_mutation
    raw.index = gene_mutation

    print('* Duplicated entries:', multi_index_duplicated(raw.index, 'Gene'))

    row_is_deletion = raw.index.get_level_values('Mutation') == 'DELETION'
    col_is_deletion = raw.columns.get_level_values('Mutation') == 'DELETION'
    del_df = raw.loc[row_is_deletion, col_is_deletion]

    print('* Keeping only deleted genes')

    gis = GIData.from_multiIndexDF(del_df)
    gis.save(output)

    print('* Processed GIs shape:', gis.shape)
    print('* Processed GIs sparsity:', sparsity(gis.values))
Ejemplo n.º 3
0
def parse_emap(input_emap, output):
    """Parse a raw E-MAP table and save its deletion-only GI matrix.

    The first three rows and first three columns of the tab-separated file
    are used as the (Gene, Mutation, Marker) label levels for the columns and
    rows respectively; everything else is the numeric matrix.

    Args:
        input_emap: Path of the tab-separated raw file (read without header).
        output: Destination handed to ``GIData.save``.
    """
    # Read raw data; no header/index yet because the labels span three
    # rows and three columns.
    df = pd.read_csv(input_emap, sep='\t', header=None, index_col=None)

    names = ['Gene', 'Mutation', 'Marker']

    # Get first three columns and first three rows for multi-index
    col_idxs = df.iloc[:3, 3:].T
    row_idxs = df.iloc[3:, :3]
    df = df.iloc[3:, 3:]
    df.index = pd.MultiIndex.from_frame(row_idxs, names=names)
    df.columns = pd.MultiIndex.from_frame(col_idxs, names=names)
    # `np.float` was a deprecated alias of the builtin and was removed in
    # NumPy 1.24; the builtin `float` selects the same float64 dtype.
    df = df.astype(float)

    # Some simple reporting
    print('* Processing data from: ', input_emap)
    print('* Raw data:')
    print(df.head())
    print('* Markers on columns:',
          df.columns.get_level_values('Marker').unique().values)
    print('* Markers on rows:',
          df.index.get_level_values('Marker').unique().values)
    print('* Mutations on columns:',
          df.columns.get_level_values('Mutation').unique().values)
    print('* Mutations on rows:',
          df.index.get_level_values('Mutation').unique().values)
    print()

    # Keep only the DELETION x DELETION sub-matrix.
    print('* Retrieving GI matrix with only deletions')

    row_is_deletion = df.index.get_level_values('Mutation') == 'DELETION'
    col_is_deletion = df.columns.get_level_values('Mutation') == 'DELETION'
    del_df = df.loc[row_is_deletion, col_is_deletion]

    gis = GIData.from_multiIndexDF(del_df)
    gis.save(output)

    print('* Processed GIs shape:', gis.shape)
    print('* Processed GIs sparsity:', sparsity(gis.values))
Ejemplo n.º 4
0
def cli(fp, output_fmt, gis_only):
    """Pivot pairwise GI records into one matrix per value and array type.

    For every array type found in the loaded table, a single query strain and
    a single array strain per gene are selected via ``get_best_strains``. The
    remaining records are pivoted to a query x array matrix and each requested
    value column is saved as its own ``GIData`` file.

    Args:
        fp: Path handed to ``load_gis``.
        output_fmt: Format string filled with ``(value_name, array_type)`` to
            build each output path.
        gis_only: When truthy, export only ``GI_score`` (development option).
    """
    # Renamed columns with shorter names...
    if gis_only:
        # option for development, parse only GI scores and not all the other values
        value_cols = ['GI_score']
    else:
        value_cols = [
            'GI_score', 'GI_pval', 'query_SMF', 'array_SMF',
            'double_mutant_fitness', 'double_mutant_fitness_std'
        ]

    # Load GI pairs
    all_gis = load_gis(fp)

    print(all_gis.head())
    print('* Read {} interactions'.format(len(all_gis)))
    print('\t- Total query genes: ', len(set(all_gis.query_orf)))
    print('\t- Total array genes: ', len(set(all_gis.array_orf)))

    print('* For each gene, getting query strain with most interactions')
    for array_type in all_gis.array_type.unique():
        print('\t- Getting array type: ', array_type)
        subset = all_gis[all_gis.array_type == array_type]

        print('\t- getting best query strains')
        best_query = get_best_strains(subset.query_strain_ID,
                                      subset.query_orf)
        subset = subset[subset.query_strain_ID.isin(best_query)]

        print('\t- getting best array strains')
        best_array = get_best_strains(subset.array_strain_ID,
                                      subset.array_orf)
        subset = subset[subset.array_strain_ID.isin(best_array)]

        # After filtering there must be exactly one strain per gene.
        n_query_genes = len(set(subset.query_orf))
        n_array_genes = len(set(subset.array_orf))
        assert len(set(subset.query_strain_ID)) == n_query_genes
        assert len(set(subset.array_strain_ID)) == n_array_genes

        print('\t- Total query genes: ', n_query_genes)
        print('\t- Total array genes: ', n_array_genes)

        pivoted = subset.pivot(index='query_orf', columns='array_orf')
        for value_name in value_cols:
            gi_mat = pivoted[value_name]
            print('\t-Extracting {} values (array type: {})'.format(
                value_name, array_type))
            print('\t\t- GIs shape:', gi_mat.shape)

            gi_mat = gi_mat.apply(pd.to_numeric)
            missing = np.isnan(gi_mat.values)

            print('\t\t- GIs shape:', gi_mat.shape)
            print('\t\t- Sparsity:', sparsity(gi_mat.values))
            print('\t\t- Interactions measured :', np.sum(~missing))
            print('\t\t- Interactions missing:', np.sum(missing))

            gi_data = GIData(values=gi_mat.values.astype(float),
                             rows=gi_mat.index.values.astype(str),
                             cols=gi_mat.columns.values.astype(str),
                             check_symmetric=False)
            out_fp = output_fmt.format(value_name, array_type)

            gi_data.save(out_fp)
            print('\t\t- Saved values to:', out_fp)
Ejemplo n.º 5
0
def cli(data_fp, pval_fp, gis_output, pvals_output):
    """Reconcile reciprocal measurements (A,B) / (B,A) in a GI matrix.

    Only gene pairs where both genes appear on both the row and the column
    axis are touched; all other entries are copied through unchanged.

    For each such pair:
      * reciprocal scores with opposite signs are set to NaN (as are their
        p-values when p-values are given);
      * otherwise, with p-values, both entries take the smaller p-value and
        the score from the (A,B) entry if its p-value is strictly smaller,
        else the score from the (B,A) entry;
      * otherwise, without p-values, both entries take the mean score.

    Args:
        data_fp: Path handed to ``cpkl_load`` for the GI scores.
        pval_fp: Optional path of the matching p-values; falsy to skip.
        gis_output: Destination of the processed scores.
        pvals_output: Destination of the processed p-values (used only when
            ``pval_fp`` is truthy).
    """
    print('* Loading data')

    print('\t- GIs from:', data_fp)
    gi_data = cpkl_load(data_fp)

    use_pvals = bool(pval_fp)
    if use_pvals:
        print('\t- p-values from:', pval_fp)
        pval_data = cpkl_load(pval_fp)

    rows = gi_data['rows']
    cols = gi_data['cols']

    print('\t- # rows', len(rows))
    print('\t- # cols', len(cols))

    # gene -> position on each axis
    row_g2i = {gene: idx for idx, gene in enumerate(rows)}
    col_g2i = {gene: idx for idx, gene in enumerate(cols)}

    asym_values = gi_data['values']
    values = asym_values.copy()
    print('\t- Sparsity: ', sparsity(asym_values))

    if use_pvals:
        asym_pvals = pval_data['values']
        pvals = asym_pvals.copy()

    for gene_a in rows:
        a_row = row_g2i.get(gene_a)
        a_col = col_g2i.get(gene_a)
        if a_row is None or a_col is None:
            # gene_a is not on both axes: no reciprocal entry exists
            continue

        for gene_b in cols:
            b_col = col_g2i.get(gene_b)
            b_row = row_g2i.get(gene_b)
            if b_col is None or b_row is None:
                continue

            # positions of the two reciprocal entries in the input matrix
            fwd = (a_row, b_col)
            rev = (b_row, a_col)
            v_fwd = asym_values[fwd]
            v_rev = asym_values[rev]

            if v_fwd * v_rev < 0:
                # opposite signs...
                values[fwd] = np.nan
                values[rev] = np.nan
                if use_pvals:
                    pvals[fwd] = np.nan
                    pvals[rev] = np.nan
                continue

            if use_pvals:
                p_fwd = asym_pvals[fwd]
                p_rev = asym_pvals[rev]
                best_p = min(p_fwd, p_rev)
                best_v = v_fwd if p_fwd < p_rev else v_rev
                pvals[fwd] = best_p
                pvals[rev] = best_p
                values[fwd] = best_v
                values[rev] = best_v
            else:
                mean_v = (v_fwd + v_rev) / 2
                values[fwd] = mean_v
                values[rev] = mean_v

    print('* Processed: summary stats')
    print('\t- Shape:', values.shape)
    print('\t- Sparsity: ', sparsity(values))

    row_names = rows.astype(str)
    col_names = cols.astype(str)

    processed_gis = GIData(values=values.astype(float),
                           rows=row_names,
                           cols=col_names,
                           check_symmetric=False)
    processed_gis.save(gis_output)

    if use_pvals:
        processed_pvals = GIData(values=pvals.astype(float),
                                 rows=rows.astype(str),
                                 cols=cols.astype(str),
                                 check_symmetric=False)
        processed_pvals.save(pvals_output)
Ejemplo n.º 6
0
def _has_duplicate_pairs(frame):
    """Return True when any (A, B) pair occurs more than once in *frame*."""
    return bool(frame.duplicated(['A', 'B']).any())


def cli(onto_fp, dcell_fp, costanzo_2010_fp, cpkl_output_fmt, tsv_output):
    """Merge Ontotype and DCell predictions and export them as GI matrices.

    The two prediction tables are inner-joined 1:1 on the gene pair (A, B),
    written to ``tsv_output``, re-oriented against the row/column gene sets of
    the Costanzo 2010 data, pivoted, and each value column is saved as a
    ``GIData`` file.

    Args:
        onto_fp: Tab-separated Ontotype predictions with five columns, renamed
            here to A, B, onto_pred, true, p.
        dcell_fp: Headerless tab-separated DCell predictions (A, B,
            dcell_pred).
        costanzo_2010_fp: Pickled mapping with ``'rows'`` and ``'cols'`` gene
            collections.
        cpkl_output_fmt: Format string filled with the value name to build
            each output path.
        tsv_output: Path of the merged tab-separated table.

    Raises:
        pandas.errors.MergeError: If the join on (A, B) is not one-to-one.
    """
    onto_df = pd.read_csv(onto_fp, sep='\t')
    onto_df.columns = ['A', 'B', 'onto_pred', 'true', 'p']
    # Pairs without a measured value cannot be evaluated; drop them.
    onto_df = onto_df[~onto_df['true'].isna()].reset_index(drop=True)
    onto_df = onto_df.sort_values(by=['A', 'B'])
    print(onto_df.head())
    print('Ontotype prediction data shape:', onto_df.shape)

    dcell_df = pd.read_csv(dcell_fp, sep='\t', header=None)
    dcell_df.columns = ['A', 'B', 'dcell_pred']
    dcell_df = dcell_df.sort_values(by=['A', 'B'])
    print(dcell_df.head())
    print('DCell prediction data shape:', dcell_df.shape)
    # NOTE(review): sort_cols presumably puts each pair's genes in a canonical
    # order so both tables key pairs the same way -- confirm against helper.
    onto_df = sort_cols(onto_df)
    dcell_df = sort_cols(dcell_df)

    print('Ontotype preds have duplicate preds for gene pairs:',
          _has_duplicate_pairs(onto_df))
    print('DCell preds have duplicate preds for gene pairs:',
          _has_duplicate_pairs(dcell_df))

    print('Dropping duplicates in DCell preds')
    dcell_df = dcell_df.drop_duplicates(['A', 'B'])
    print('DCell preds have duplicate preds for gene pairs:',
          _has_duplicate_pairs(dcell_df))

    print(
        'Performing inner join to merge DCell and Ontotype predictions... (validating that join is 1-1)'
    )
    merged_df = onto_df.merge(dcell_df,
                              how='inner',
                              on=['A', 'B'],
                              validate='1:1')
    print(merged_df.head())
    print('** Ontotype: **')
    report(merged_df, 'onto_pred')

    print('** DCell: **')
    report(merged_df, 'dcell_pred')

    merged_df.to_csv(tsv_output, sep='\t', index=False)

    value_cols = ['onto_pred', 'dcell_pred', 'true', 'p']

    #TODO: we need to make this data match the shape of the data with our own processed Gs.
    # That is, we need to figure whether a pair is
    # A) query - array
    # b) array - query (so we swap)
    # c) or both array and query, then we add two values to the matrix.

    with open(costanzo_2010_fp, 'rb') as f:
        costanzo10_data = cpkl.load(f)

    costanzo10_rows = set(costanzo10_data['rows'])
    costanzo10_cols = set(costanzo10_data['cols'])
    print(len(costanzo10_rows), len(costanzo10_cols))

    # The masks are built from merged_df itself. Building them from the
    # pre-merge Ontotype frame (as before) selects the wrong rows, because
    # sorting and merging change the row order and the row count.
    a_in_rows = merged_df['A'].isin(costanzo10_rows)
    a_in_cols = merged_df['A'].isin(costanzo10_cols)
    b_in_rows = merged_df['B'].isin(costanzo10_rows)
    b_in_cols = merged_df['B'].isin(costanzo10_cols)

    # Both genes on both axes: the pair fills (A, B) and (B, A).
    needs_duplicates = a_in_rows & a_in_cols & b_in_rows & b_in_cols
    print(needs_duplicates.sum())

    # Incorrectly oriented (i.e. A is not in rows, or B is not in cols)
    needs_flipping = ~a_in_rows | ~b_in_cols
    print(needs_flipping.sum())

    # Correctly oriented and does not need duplicating
    keep = ~(needs_flipping | needs_duplicates)
    print(keep.sum())

    swap_ab = {'A': 'B', 'B': 'A'}
    # rename() returns a new frame, so merged_df's slices are never mutated.
    dup_df = merged_df[needs_duplicates].rename(columns=swap_ab)
    flip_df = merged_df[needs_flipping].rename(columns=swap_ab)

    # DataFrame.append was removed in pandas 2.0; concat aligns the swapped
    # frames by column name in the same way.
    new_df = pd.concat(
        [merged_df[needs_duplicates], dup_df, flip_df, merged_df[keep]])
    print(new_df.head())
    print('row genes', len(set(new_df['A'])))
    print('col genes', len(set(new_df['B'])))

    print(len(new_df))

    new_df = new_df.pivot(index='A', columns='B')

    for value_name in value_cols:
        gi_mat = new_df[value_name]
        print('\t-Extracting {} values'.format(value_name))
        print('\t\t- GIs shape:', gi_mat.shape)

        gi_mat = gi_mat.apply(pd.to_numeric)

        print('\t\t- GIs shape:', gi_mat.shape)
        print('\t\t- Sparsity:', sparsity(gi_mat.values))
        print('\t\t- Interactions measured :',
              np.sum(~np.isnan(gi_mat.values)))
        print('\t\t- Interactions missing:', np.sum(np.isnan(gi_mat.values)))

        gi_data = GIData(values=gi_mat.values.astype(float),
                         rows=gi_mat.index.values.astype(str),
                         cols=gi_mat.columns.values.astype(str),
                         check_symmetric=False)
        fp = cpkl_output_fmt.format(value_name)

        gi_data.save(fp)
        print('\t\t- Saved values to:', fp)
Ejemplo n.º 7
0
        print('\t\t- {} DAMP genes on rows'.format(np.sum(damp_rows)))
        gi_mat = gi_mat.iloc[~damp_rows, ~damp_cols]
        print('\t- GIs shape:', gi_mat.shape)

    # Remove TS alleles genes
    if not args.use_ts_genes:
        print('\t* Removing TS genes')
        ts_regex = '.+_tsq.*'
        ts_cols = gi_mat.columns.str.match(ts_regex)
        ts_rows = gi_mat.index.str.match(ts_regex)
        print('\t\t- {} TS genes on cols'.format(np.sum(ts_cols)))
        print('\t\t- {} TS genes on rows'.format(np.sum(ts_rows)))
        gi_mat = gi_mat.iloc[~ts_rows, ~ts_cols]
        print('\t- GIs shape:', gi_mat.shape)

    gi_mat = gi_mat.apply(pd.to_numeric)
    assert (len(gi_mat.index) == len(set(gi_mat.index)))
    assert (len(gi_mat.columns) == len(set(gi_mat.columns)))

    print('\t- GIs shape:', gi_mat.shape)
    print('\t- Sparsity:', sparsity(gi_mat.values))
    print('\t- Interactions measured :', np.sum(~np.isnan(gi_mat.values)))
    print('\t- Interactions missing:', np.sum(np.isnan(gi_mat.values)))

    gi_data = GIData(values=gi_mat.values.astype(float),
                     rows=gi_mat.index.values.astype(str),
                     cols=gi_mat.columns.values.astype(str),
                     check_symmetric=False)
    fp = args.output_fmt.format(value_name)
    gi_data.save(fp)
    print('\t- Saved values to:', fp)