Example #1
0
def get_interaction_elements(interactions_df, interaction_id_column,
                             left_fragment_columns, right_fragment_columns,
                             left_elements_df, right_elements_df):
    """Pair elements found on the two fragments of each interaction.

    Both orientations are considered: left elements on left fragments paired
    with right elements on right fragments, and the swapped arrangement
    (right elements on left fragments, left elements on right fragments).

    Args:
        interactions_df: DataFrame containing both groups of fragment
            columns and the interaction id column.
        interaction_id_column: Name of the interaction id column; it is the
            key used to join the two sides of a pair.
        left_fragment_columns: List of column names selecting the left
            fragment from ``interactions_df``.
        right_fragment_columns: List of column names selecting the right
            fragment from ``interactions_df``.
        left_elements_df: DataFrame whose last column name ends in
            ``_name``.
        right_elements_df: DataFrame whose last column name ends in
            ``_name``.

    Returns:
        DataFrame with the pairs from both orientations concatenated, keeping
        only the first row for every (left name, right name) combination.

    Raises:
        AssertionError: If the last column of either elements frame does not
            end in ``_name``.
    """
    left_name = left_elements_df.columns[-1]
    right_name = right_elements_df.columns[-1]
    assert all(name.endswith('_name') for name in (left_name, right_name))

    def select_fragments(fragment_columns):
        # keep the interaction id next to the fragment columns so the
        # overlaps can be merged back into pairs
        return interactions_df[fragment_columns + [interaction_id_column]]

    def overlap(elements_df, fragments_df):
        # NOTE(review): presumably wraps `bedtools intersect`; confirm
        # against the chromatics module
        return chromatics.bedtools('intersect -wa -wb', elements_df,
                                   fragments_df)

    left_fragments = select_fragments(left_fragment_columns)
    right_fragments = select_fragments(right_fragment_columns)

    # (elements matched to left fragments, elements matched to right fragments)
    orientations = (
        (left_elements_df, right_elements_df),
        (right_elements_df, left_elements_df),
    )
    pair_frames = [
        pd.merge(overlap(on_left, left_fragments),
                 overlap(on_right, right_fragments),
                 on=interaction_id_column)
        for on_left, on_right in orientations
    ]

    pairs_df = pd.concat(pair_frames, ignore_index=True)
    return pairs_df.drop_duplicates([left_name, right_name])
Example #2
0
def get_enrichment(a, b, c):
    """Return the Fisher exact test p-value comparing ``a`` and ``b`` on ``c``.

    The 2x2 contingency table has one row per input (``b`` first, then ``a``)
    and one column per outcome (rows not reported by the intersection with
    ``c``, then rows reported by it).

    Args:
        a: First set of intervals; must support ``len``.
        b: Second set of intervals; must support ``len``.
        c: Intervals that ``a`` and ``b`` are intersected against.

    Returns:
        The second item of ``stats.fisher_exact``'s result (the p-value).
    """
    # NOTE(review): the flags look like `bedtools intersect` options that
    # need sorted input and full containment -- confirm in chromatics
    command = 'intersect -sorted -u -f 1.0'
    a_hits = len(chromatics.bedtools(command, a, c))
    b_hits = len(chromatics.bedtools(command, b, c))

    table = np.array(
        [
            [len(b) - b_hits, b_hits],
            [len(a) - a_hits, a_hits],
        ],
        dtype=float,
    )
    result = stats.fisher_exact(table)
    return result[1]
Example #3
0
def get_enrichment(a, b, c):
    """Return the Fisher exact test p-value comparing ``a`` and ``b`` on ``c``.

    The 2x2 contingency table has one row per input (``b`` first, then ``a``)
    and one column per outcome (rows not reported by the intersection with
    ``c``, then rows reported by it).

    Args:
        a: First set of intervals; must support ``len``.
        b: Second set of intervals; must support ``len``.
        c: Intervals that ``a`` and ``b`` are intersected against.

    Returns:
        The second item of ``stats.fisher_exact``'s result (the p-value).
    """
    # NOTE(review): the flags look like `bedtools intersect` options that
    # need sorted input and full containment -- confirm in chromatics
    command = 'intersect -sorted -u -f 1.0'
    a_hits = len(chromatics.bedtools(command, a, c))
    b_hits = len(chromatics.bedtools(command, b, c))

    table = np.array(
        [
            [len(b) - b_hits, b_hits],
            [len(a) - a_hits, a_hits],
        ],
        dtype=float,
    )
    result = stats.fisher_exact(table)
    return result[1]
Example #4
0
def get_ordered_interaction_elements(interactions_df, interaction_id_column,
                                     left_fragment_columns,
                                     right_fragment_columns, left_elements_df,
                                     right_elements_df):
    """Pair left elements on left fragments with right elements on right ones.

    Only this single orientation is produced; the swapped arrangement is not
    considered.

    Args:
        interactions_df: DataFrame containing both groups of fragment
            columns and the interaction id column.
        interaction_id_column: Name of the interaction id column; it is the
            merge key joining the two sides.
        left_fragment_columns: List of column names selecting the left
            fragment from ``interactions_df``.
        right_fragment_columns: List of column names selecting the right
            fragment from ``interactions_df``.
        left_elements_df: DataFrame whose last column name ends in
            ``_name``.
        right_elements_df: DataFrame whose last column name ends in
            ``_name``.

    Returns:
        DataFrame produced by merging the two overlap results on
        ``interaction_id_column``.

    Raises:
        AssertionError: If the last column of either elements frame does not
            end in ``_name``.
    """
    name_columns = (left_elements_df.columns[-1],
                    right_elements_df.columns[-1])
    assert all(column.endswith('_name') for column in name_columns)

    def overlap_with_fragments(elements_df, fragment_columns):
        # the interaction id travels with the fragment columns so the two
        # sides can be joined afterwards
        fragments_df = interactions_df[fragment_columns +
                                       [interaction_id_column]]
        return chromatics.bedtools('intersect -wa -wb', elements_df,
                                   fragments_df)

    left_hits_df = overlap_with_fragments(left_elements_df,
                                          left_fragment_columns)
    right_hits_df = overlap_with_fragments(right_elements_df,
                                           right_fragment_columns)
    return pd.merge(left_hits_df, right_hits_df, on=interaction_id_column)
Example #5
0
def generate_average_signal_features(chunk_df, region, dataset):
    """Build a per-region table of length-normalised signal, one column per dataset.

    For every unique region name in ``chunk_df`` the ``signal_value`` entries
    returned by the intersection with ``dataset`` are summed per dataset
    label and divided by the region length (``<region>_end`` minus
    ``<region>_start``).

    Args:
        chunk_df: DataFrame containing the ``<region>_*`` columns named by
            ``chromatics.generic_bed_columns``.
        region: Column-name prefix of the region (for example the part
            before ``_start`` / ``_end`` / ``_name``).
        dataset: Second input handed to ``chromatics.bedtools``; its columns
            are named by ``chromatics.signal_bed_columns`` and must include
            ``dataset`` and ``signal_value``.

    Returns:
        Pivot table indexed by ``<region>_name`` with one column per dataset
        label (each suffixed with `` (<region>)``) holding the normalised
        signal.

    Raises:
        AssertionError: If any region has an end that is not greater than
            its start.
    """
    start_column = region + '_start'
    end_column = region + '_end'
    name_column = region + '_name'
    assert (chunk_df[end_column] > chunk_df[start_column]).all()

    region_bed_columns = [
        '{}_{}'.format(region, suffix)
        for suffix in chromatics.generic_bed_columns
    ]
    # each region is intersected once, even if it appears in several rows
    unique_regions_df = chunk_df[region_bed_columns].drop_duplicates(
        name_column)
    signal_df = chromatics.bedtools(
        'intersect -wa -wb',
        unique_regions_df,
        dataset,
        right_names=chromatics.signal_bed_columns)

    group_columns = [name_column, start_column, end_column, 'dataset']
    # The string 'sum' selects pandas' own group sum. Passing the builtin
    # `sum` is deprecated and will eventually be called directly, which
    # would change how missing values are handled.
    average_signal_df = signal_df.groupby(
        group_columns, sort=False,
        as_index=False).aggregate({'signal_value': 'sum'})
    average_signal_df['signal_value'] /= (average_signal_df[end_column] -
                                          average_signal_df[start_column])
    # tag the dataset label with the region so columns from different
    # regions stay distinguishable
    average_signal_df['dataset'] += ' ({})'.format(region)

    return average_signal_df.pivot_table(index=name_column,
                                         columns='dataset',
                                         values='signal_value')
Example #6
0
def get_interaction_elements(interactions_df, interaction_id_column,
                             left_fragment_columns, right_fragment_columns,
                             left_elements_df, right_elements_df):
    """Pair elements found on the two fragments of each interaction.

    Both orientations are considered: left elements on left fragments paired
    with right elements on right fragments, and the swapped arrangement
    (right elements on left fragments, left elements on right fragments).

    Args:
        interactions_df: DataFrame containing both groups of fragment
            columns and the interaction id column.
        interaction_id_column: Name of the interaction id column; it is the
            key used to join the two sides of a pair.
        left_fragment_columns: List of column names selecting the left
            fragment from ``interactions_df``.
        right_fragment_columns: List of column names selecting the right
            fragment from ``interactions_df``.
        left_elements_df: DataFrame whose last column name ends in
            ``_name``.
        right_elements_df: DataFrame whose last column name ends in
            ``_name``.

    Returns:
        DataFrame with the pairs from both orientations concatenated, keeping
        only the first row for every (left name, right name) combination.

    Raises:
        AssertionError: If the last column of either elements frame does not
            end in ``_name``.
    """
    left_name = left_elements_df.columns[-1]
    right_name = right_elements_df.columns[-1]
    assert all(name.endswith('_name') for name in (left_name, right_name))

    def select_fragments(fragment_columns):
        # keep the interaction id next to the fragment columns so the
        # overlaps can be merged back into pairs
        return interactions_df[fragment_columns + [interaction_id_column]]

    def overlap(elements_df, fragments_df):
        # NOTE(review): presumably wraps `bedtools intersect`; confirm
        # against the chromatics module
        return chromatics.bedtools('intersect -wa -wb', elements_df,
                                   fragments_df)

    left_fragments = select_fragments(left_fragment_columns)
    right_fragments = select_fragments(right_fragment_columns)

    # (elements matched to left fragments, elements matched to right fragments)
    orientations = (
        (left_elements_df, right_elements_df),
        (right_elements_df, left_elements_df),
    )
    pair_frames = [
        pd.merge(overlap(on_left, left_fragments),
                 overlap(on_right, right_fragments),
                 on=interaction_id_column)
        for on_left, on_right in orientations
    ]

    pairs_df = pd.concat(pair_frames, ignore_index=True)
    return pairs_df.drop_duplicates([left_name, right_name])
def generate_average_signal_features(chunk_df, region, dataset):
    """Build a per-region table of length-normalised signal, one column per dataset.

    For every unique region name in ``chunk_df`` the ``signal_value`` entries
    returned by the intersection with ``dataset`` are summed per dataset
    label and divided by the region length (``<region>_end`` minus
    ``<region>_start``).

    Args:
        chunk_df: DataFrame containing the ``<region>_*`` columns named by
            ``chromatics.generic_bed_columns``.
        region: Column-name prefix of the region (for example the part
            before ``_start`` / ``_end`` / ``_name``).
        dataset: Second input handed to ``chromatics.bedtools``; its columns
            are named by ``chromatics.signal_bed_columns`` and must include
            ``dataset`` and ``signal_value``.

    Returns:
        Pivot table indexed by ``<region>_name`` with one column per dataset
        label (each suffixed with `` (<region>)``) holding the normalised
        signal.

    Raises:
        AssertionError: If any region has an end that is not greater than
            its start.
    """
    start_column = region + '_start'
    end_column = region + '_end'
    name_column = region + '_name'
    assert (chunk_df[end_column] > chunk_df[start_column]).all()

    region_bed_columns = [
        '{}_{}'.format(region, suffix)
        for suffix in chromatics.generic_bed_columns
    ]
    # each region is intersected once, even if it appears in several rows
    unique_regions_df = chunk_df[region_bed_columns].drop_duplicates(
        name_column)
    signal_df = chromatics.bedtools(
        'intersect -wa -wb',
        unique_regions_df,
        dataset,
        right_names=chromatics.signal_bed_columns)

    group_columns = [name_column, start_column, end_column, 'dataset']
    # The string 'sum' selects pandas' own group sum. Passing the builtin
    # `sum` is deprecated and will eventually be called directly, which
    # would change how missing values are handled.
    average_signal_df = signal_df.groupby(
        group_columns, sort=False,
        as_index=False).aggregate({'signal_value': 'sum'})
    average_signal_df['signal_value'] /= (average_signal_df[end_column] -
                                          average_signal_df[start_column])
    # tag the dataset label with the region so columns from different
    # regions stay distinguishable
    average_signal_df['dataset'] += ' ({})'.format(region)

    return average_signal_df.pivot_table(index=name_column,
                                         columns='dataset',
                                         values='signal_value')
Example #8
0
def get_ordered_interaction_elements(interactions_df, interaction_id_column,
                                     left_fragment_columns,
                                     right_fragment_columns, left_elements_df,
                                     right_elements_df):
    """Pair left elements on left fragments with right elements on right ones.

    Only this single orientation is produced; the swapped arrangement is not
    considered.

    Args:
        interactions_df: DataFrame containing both groups of fragment
            columns and the interaction id column.
        interaction_id_column: Name of the interaction id column; it is the
            merge key joining the two sides.
        left_fragment_columns: List of column names selecting the left
            fragment from ``interactions_df``.
        right_fragment_columns: List of column names selecting the right
            fragment from ``interactions_df``.
        left_elements_df: DataFrame whose last column name ends in
            ``_name``.
        right_elements_df: DataFrame whose last column name ends in
            ``_name``.

    Returns:
        DataFrame produced by merging the two overlap results on
        ``interaction_id_column``.

    Raises:
        AssertionError: If the last column of either elements frame does not
            end in ``_name``.
    """
    name_columns = (left_elements_df.columns[-1],
                    right_elements_df.columns[-1])
    assert all(column.endswith('_name') for column in name_columns)

    def overlap_with_fragments(elements_df, fragment_columns):
        # the interaction id travels with the fragment columns so the two
        # sides can be joined afterwards
        fragments_df = interactions_df[fragment_columns +
                                       [interaction_id_column]]
        return chromatics.bedtools('intersect -wa -wb', elements_df,
                                   fragments_df)

    left_hits_df = overlap_with_fragments(left_elements_df,
                                          left_fragment_columns)
    right_hits_df = overlap_with_fragments(right_elements_df,
                                           right_fragment_columns)
    return pd.merge(left_hits_df, right_hits_df, on=interaction_id_column)
Example #9
0
 def get_fragment_elements(fragments_df, elements_df):
     """Return the distinct last-column values of fragments hit by elements.

     Args:
         fragments_df: First input to the intersection; its rows are the
             ones reported.
         elements_df: Second input to the intersection.

     Returns:
         A ``set`` built from the last column of the intersection result.
     """
     # NOTE(review): `-wa -u` presumably reports each fragment at most once
     # when it overlaps any element -- confirm against chromatics.bedtools
     overlapping_df = chromatics.bedtools('intersect -wa -u', fragments_df,
                                          elements_df)
     last_column = overlapping_df.iloc[:, -1]
     return set(last_column)
Example #10
0
# summary statistics for the labelled pairs
print(pairs_df['enhancer_distance_to_promoter'].describe())

print('\ndistance bins:')
print(pairs_df.groupby('label')['bin'].value_counts())

print('\nclasses:')
print(pairs_df['label'].value_counts())

print('\nenhancers per promoter (positives only):')
print(
    pairs_df.query('label == 1')
    .groupby('promoter_name')['enhancer_name']
    .nunique()
    .describe()
)

print('\npromoters per enhancer (positives only):')
print(
    pairs_df.query('label == 1')
    .groupby('enhancer_name')['promoter_name']
    .nunique()
    .describe(),
    '\n',
)

# the window chromosome is copied from the enhancer before naming the windows
pairs_df['window_chrom'] = pairs_df['enhancer_chrom']
chromatics.add_names(pairs_df, chromatics.window_bed_columns, cell_line)

# add a few useful features here -- positive interactions already in the window
# (the command string has no placeholders, so it is passed as-is; only the
# last two output columns are kept and merged back on window_name)
interactions_in_window_df = chromatics.bedtools(
    'coverage -counts -F 1.0',
    pairs_df[chromatics.window_bed_columns],
    pairs_df.query('label == 1')[chromatics.window_bed_columns],
    left_names=chromatics.window_bed_columns,
    right_names=['interactions_in_window'],
).iloc[:, -2:]
assert len(pairs_df) == len(interactions_in_window_df)
pairs_df = pd.merge(pairs_df, interactions_in_window_df, on='window_name')

# active genes skipped over by the loop
active_promoters_in_window_df = chromatics.bedtools(
    'coverage -counts',
    pairs_df[chromatics.window_bed_columns],
    promoters_df,
    left_names=chromatics.window_bed_columns,
    right_names=['active_promoters_in_window'],
).iloc[:, -2:]
assert len(pairs_df) == len(active_promoters_in_window_df)
pairs_df = pd.merge(
    pairs_df, active_promoters_in_window_df, on='window_name'
).drop('interaction_id', axis=1)

# save
assert pairs_df.duplicated().sum() == 0
pairs_df.to_csv('pairs.csv', index=False)
# replace the melted variable/value columns with their expanded counterparts
expression_df = pd.concat(
    [
        expression_df.drop(['variable', 'value'], axis=1),
        expression_variables_df,
        expression_values_df,
    ],
    axis=1,
)

# grab polyA+ genes in the cell since cytosol doesn't have replicates for all cell lines
# (copy so the column assignments below act on an independent frame rather
# than on a filtered view of the original)
expression_df = expression_df.query(
    'rna_extract == "longPolyA" and localization == "cell"'
).copy()
expression_df['rpkm1'] = pd.to_numeric(expression_df['rpkm1'])
expression_df['rpkm2'] = pd.to_numeric(expression_df['rpkm2'])
# unparseable idr entries become NaN instead of raising
expression_df['idr'] = pd.to_numeric(expression_df['idr'], errors='coerce')

# drop inconsistently expressed genes and genes with low expression using cutoff from Ramskold et al., "An Abundance of Ubiquitously Expressed Genes Revealed by Tissue Transcriptome Sequence Data", PLoS Comp Bio 2009
# (the reported percentage uses the same idr_cutoff as the filter below)
print('{:.2%} of genes exceed IDR cutoff'.format(
    expression_df.eval('idr > @idr_cutoff').sum() / len(expression_df)))
print('expression cutoff: {} rpkm'.format(expression_cutoff))
expression_df = expression_df.query(
    'idr <= @idr_cutoff and ((rpkm1 + rpkm2) / 2) > @expression_cutoff')

# combine tss and expression data
active_promoters_df = pd.concat(
    [expression_df, final_tss_df], axis=1, join='inner'
).reset_index()
# the tss is written as both start and end of the interval
active_promoters_df['gene_tss_dup'] = active_promoters_df['gene_tss']
active_promoters_df = active_promoters_df[
    ['gene_chrom', 'gene_tss', 'gene_tss_dup', 'gene_id']
]
chromatics.write_bed(active_promoters_df, 'tss.bed')

# find active chromhmm promoters
promoters_df = chromatics.bedtools(
    'intersect -wa -u', chromhmm_promoters_df, active_promoters_df)

# optionally expand promoter coordinates
promoter_extension_size = (
    config['promoter_extension_size']
    if 'promoter_extension_size' in config
    else 0
)
promoters_df['promoter_start'] -= promoter_extension_size
promoters_df['promoter_end'] += promoter_extension_size

# save
assert promoters_df.duplicated().sum() == 0
chromatics.write_bed(promoters_df, 'promoters.bed')
print(promoters_df.eval('promoter_end - promoter_start').describe(), '\n')
Example #12
0
 def get_fragment_elements(fragments_df, elements_df):
     """Return the distinct last-column values of fragments hit by elements.

     Args:
         fragments_df: First input to the intersection; its rows are the
             ones reported.
         elements_df: Second input to the intersection.

     Returns:
         A ``set`` built from the last column of the intersection result.
     """
     # NOTE(review): `-wa -u` presumably reports each fragment at most once
     # when it overlaps any element -- confirm against chromatics.bedtools
     overlapping_df = chromatics.bedtools('intersect -wa -u', fragments_df,
                                          elements_df)
     last_column = overlapping_df.iloc[:, -1]
     return set(last_column)