def generate_subsampled_datasets():
    '''
    Generate subsampled TSVs from the MNIST dataset.
    '''
    from clustergrammer import Network

    net = Network()
    # load full MNIST data with row labels
    net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')
    tmp_df = net.dat_to_df()
    df = tmp_df['mat']

    all_sample_nums = [20, 100, 200, 300, 400, 500, 1000]
    sample_repeats = 5

    for sample_num in all_sample_nums:

        df_subs = take_multiple_subsamples(df, sample_num, sample_repeats)

        for inst_subsample in df_subs:
            inst_df = df_subs[inst_subsample]

            inst_df = add_MNIST_cats(inst_df, row_cats=False)

            inst_filename = 'processed_MNIST/random_subsampling/MNIST_' + \
                            str(sample_num) + 'x_random_subsample_' + \
                            str(inst_subsample) + '.txt'

            print(inst_df.shape)
            inst_df.to_csv(inst_filename, sep='\t')
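
# take_multiple_subsamples is not defined in this listing; below is a minimal
# sketch consistent with how it is called above, assuming it returns a dict
# mapping repeat index to a DataFrame of sample_num randomly chosen columns
# (MNIST instances are stored as columns); the random_state choice is an
# assumption added for reproducibility.
def take_multiple_subsamples(df, sample_num, sample_repeats):
    df_subs = {}
    for inst_rep in range(sample_repeats):
        # randomly subsample columns (instances) without replacement
        df_subs[inst_rep] = df.sample(n=sample_num, axis=1, random_state=inst_rep)
    return df_subs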
def make_plex_matrix():
  '''
  Make a cell line matrix with plex rows and cell line columns.
  This will be used as a negative control that should show worsening correlation
  as data is normalized/filtered.
  '''
  import numpy as np
  import pandas as pd
  from clustergrammer import Network

  # load cl_info
  net = Network()
  cl_info = net.load_json_to_dict('../cell_line_info/cell_line_info_dict.json')

  # load cell line expression
  net.load_file('../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()

  # plex numbers 1 through 9
  rows = list(range(1, 10))
  print(rows)

  mat = np.zeros((len(rows), len(cols)))

  for inst_col in cols:

    for inst_cl in cl_info:

      if inst_col in inst_cl:
        inst_plex = int(cl_info[inst_cl]['Plex'])

        if inst_plex != -1:
          # print(inst_col + ' in ' + inst_cl + ': ' + str(inst_plex))

          row_index = rows.index(inst_plex)
          col_index = cols.index(inst_col)

          mat[row_index, col_index] = 1


  df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)

  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
            'exp-plex.txt'
  df_plex.to_csv(filename, sep='\t')
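
# For reference, make_plex_matrix only reads the 'Plex' field of each entry in
# cell_line_info_dict.json; a hypothetical structure is shown below (the
# cell-line names are assumptions; -1 marks lines without a plex assignment,
# matching the check above).
cl_info_example = {
    'NCIH2110_LUNG': {'Plex': '3'},
    'NCIH1944_LUNG': {'Plex': '-1'},
}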
def main():
  from clustergrammer import Network

  rtk_list = load_rtks()

  net = Network()
  net.load_file('txt/tmp_cst_drug_treat_cl.txt')
  df_dict = net.dat_to_df()

  inst_df = df_dict['mat']

  # .ix was removed from pandas; select the RTK rows by label with .loc
  inst_df = inst_df.loc[rtk_list]

  inst_df.to_csv('txt/RTK_exp_in_drug_treat_cl.txt', sep='\t')
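
# load_rtks is not defined in this listing; a plausible sketch, assuming a
# plain-text file (hypothetical path) with one RTK gene symbol per line.
def load_rtks(filename='txt/rtk_list.txt'):
    with open(filename) as f:
        return [line.strip() for line in f if line.strip()]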
def equal_digit_sampling_MNIST():
    '''
    Sample N instances of each digit from the MNIST dataset.
    '''

    from clustergrammer import Network
    net = Network()
    net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')
    tmp_df = net.dat_to_df()
    df = tmp_df['mat']

    print(df.shape)

    label_dict = get_label_dict()

    num_sample = 30

    # only keep num_sample instances of each digit
    ###########################################
    keep_cols = []

    for inst_digit in label_dict:
        tmp_name = label_dict[inst_digit]

        # select num_sample instances of each digit
        for i in range(num_sample):
            inst_name = tmp_name + '-' + str(i)
            keep_cols.append(inst_name)

    # grab subset of numbers
    df = df[keep_cols]

    df = add_MNIST_cats(df)

    print('shape after processing')
    print(df.shape)

    df.to_csv('processed_MNIST/MNIST_' + str(num_sample) + 'x_original.txt',
              sep='\t')
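
# get_label_dict is not defined in this listing; a sketch consistent with the
# '<label>-<i>' column names built above, mapping each digit to its label
# prefix. The exact label strings are an assumption.
def get_label_dict():
    names = ['zero', 'one', 'two', 'three', 'four',
             'five', 'six', 'seven', 'eight', 'nine']
    return {inst_digit: names[inst_digit] for inst_digit in range(10)}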
def make_json_from_tsv(name):
  '''
  Make a clustergrammer JSON from a TSV file.
  '''
  from clustergrammer import Network

  print('\n' + name)

  net = Network()

  filename = 'txt/'+ name + '.txt'

  net.load_file(filename)

  net.swap_nan_for_zero()

  # zscore first to get the columns distributions to be similar
  net.normalize(axis='col', norm_type='zscore', keep_orig=True)

  # filter the rows to keep the perturbations with the largest normalized values
  net.filter_N_top('row', 1000)

  num_rows = net.dat['mat'].shape[0]
  num_cols = net.dat['mat'].shape[1]

  print('num_rows ' + str(num_rows))
  print('num_cols ' + str(num_cols))

  if num_cols < 50 or num_rows < 1000:

    views = ['N_row_sum']
    net.make_clust(dist_type='cos', views=views)
    export_filename = 'json/' + name + '.json'
    net.write_json_to_file('viz', export_filename)

  else:
    print('did not cluster: matrix too large (>= 50 cols and >= 1000 rows)')
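
# Example usage of make_json_from_tsv; 'rc_two_cats' is a file referenced
# later in this listing, and other names would follow the same
# txt/<name>.txt convention.
for inst_name in ['rc_two_cats']:
    make_json_from_tsv(inst_name)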
def reproduce_Mark_correlation_matrix():
  import pandas as pd
  from scipy.spatial.distance import squareform
  from clustergrammer import Network

  dist_vect = calc_custom_dist(data_type='ptm_none', dist_metric='correlation',
                               pairwise='True')

  dist_mat = squareform(dist_vect)

  # make similarity matrix
  dist_mat = 1 - dist_mat

  data_type = 'ptm_none'

  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
             data_type + '.txt'

  # load file and export dataframe
  net = Network()
  net.load_file(filename)
  net.swap_nan_for_zero()
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()
  rows = cols

  mark_df = pd.DataFrame(data=dist_mat, columns=cols, index=rows)

  save_filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
                  'Mark_corr_sim_mat.txt'
  mark_df.to_csv(save_filename, sep='\t', na_rep='nan')
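
# calc_custom_dist is not defined in this listing; below is a minimal
# reconstruction based on how its output is used above: a condensed distance
# vector suitable for squareform, computed between cell-line columns. The
# handling of the pairwise flag is unknown and it is ignored in this sketch.
def calc_custom_dist(data_type='ptm_none', dist_metric='correlation',
                     pairwise='True'):
    from scipy.spatial.distance import pdist
    from clustergrammer import Network

    filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
               data_type + '.txt'
    net = Network()
    net.load_file(filename)
    net.swap_nan_for_zero()
    df = net.dat_to_df()['mat']

    # condensed pairwise distances between columns (cell lines)
    return pdist(df.transpose(), metric=dist_metric)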
def clust_from_response(response_list):
    from clustergrammer import Network
    import numpy as np
    import pandas as pd
    import math
    from copy import deepcopy

    # print('----------------------')
    # print('enrichr_clust_from_response')
    # print('----------------------')

    ini_enr = transfer_to_enr_dict(response_list)

    enr = []
    scores = {}
    score_types = ['combined_score', 'pval', 'zscore']

    for score_type in score_types:
        # give the empty Series an explicit dtype
        scores[score_type] = pd.Series(dtype='float64')

    for inst_enr in ini_enr:
        if inst_enr['combined_score'] > 0:

            # make series of enriched terms with scores
            for score_type in score_types:

                # collect the scores of the enriched terms
                if score_type == 'combined_score':
                    scores[score_type][inst_enr['name']] = inst_enr[score_type]
                if score_type == 'pval':
                    scores[score_type][inst_enr['name']] = -math.log(
                        inst_enr[score_type])
                if score_type == 'zscore':
                    scores[score_type][
                        inst_enr['name']] = -inst_enr[score_type]

            # keep enrichment values
            enr.append(inst_enr)

    # sort and normalize the scores
    for score_type in score_types:
        scores[score_type] = scores[score_type] / scores[score_type].max()
        scores[score_type].sort_values(ascending=False)

    number_of_enriched_terms = len(scores['combined_score'])

    enr_score_types = ['combined_score', 'pval', 'zscore']

    if number_of_enriched_terms < 10:
        num_dict = {'ten': 10}
    elif number_of_enriched_terms < 20:
        num_dict = {'ten': 10, 'twenty': 20}
    else:
        num_dict = {'ten': 10, 'twenty': 20, 'thirty': 30}

    # gather lists of top scores
    top_terms = {}
    for enr_type in enr_score_types:
        top_terms[enr_type] = {}
        for num_terms in list(num_dict.keys()):
            inst_num = num_dict[num_terms]
            top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[:inst_num]

    # gather the terms that should be kept - they are at the top of the score list
    keep_terms = []
    for inst_enr_score in top_terms:
        for tmp_num in list(num_dict.keys()):
            keep_terms.extend(top_terms[inst_enr_score][tmp_num])

    keep_terms = list(set(keep_terms))

    # keep enriched terms that made a top-N list for at least one score
    keep_enr = []
    for inst_enr in enr:
        if inst_enr['name'] in keep_terms:
            keep_enr.append(inst_enr)

    # fill in full matrix
    #######################

    # genes
    row_node_names = []
    # enriched terms
    col_node_names = []

    # gather information from the list of enriched terms
    for inst_enr in keep_enr:
        col_node_names.append(inst_enr['name'])
        row_node_names.extend(inst_enr['int_genes'])

    row_node_names = sorted(list(set(row_node_names)))

    net = Network()
    net.dat['nodes']['row'] = row_node_names
    net.dat['nodes']['col'] = col_node_names
    net.dat['mat'] = np.zeros([len(row_node_names), len(col_node_names)])

    for inst_enr in keep_enr:

        inst_term = inst_enr['name']
        col_index = col_node_names.index(inst_term)

        # use combined score for full matrix - will not be seen in viz
        tmp_score = scores['combined_score'][inst_term]
        net.dat['node_info']['col']['value'].append(tmp_score)

        for inst_gene in inst_enr['int_genes']:
            row_index = row_node_names.index(inst_gene)

            # save association
            net.dat['mat'][row_index, col_index] = 1

    # cluster full matrix
    #############################
    # do not make multiple views
    views = ['']

    if len(net.dat['nodes']['row']) > 1:
        net.make_clust(dist_type='jaccard', views=views, dendro=False)
    else:
        net.make_clust(dist_type='jaccard',
                       views=views,
                       dendro=False,
                       run_clustering=False)

    # get dataframe from full matrix
    df = net.dat_to_df()

    for score_type in score_types:

        for num_terms in num_dict:

            inst_df = deepcopy(df)
            inst_net = Network()

            inst_df['mat'] = inst_df['mat'][top_terms[score_type][num_terms]]

            # load back into net
            inst_net.df_to_dat(inst_df)

            # make views
            if len(net.dat['nodes']['row']) > 1:
                inst_net.make_clust(dist_type='jaccard',
                                    views=['N_row_sum'],
                                    dendro=False)
            else:
                inst_net.make_clust(dist_type='jaccard',
                                    views=['N_row_sum'],
                                    dendro=False,
                                    run_clustering=False)

            inst_views = inst_net.viz['views']

            # add score_type to views
            for inst_view in inst_views:

                inst_view['N_col_sum'] = num_dict[num_terms]

                inst_view['enr_score_type'] = score_type

                # add values to col_nodes and order according to rank
                for inst_col in inst_view['nodes']['col_nodes']:

                    inst_terms = top_terms[score_type][num_terms]
                    inst_col['rank'] = len(inst_terms) - \
                        inst_terms.index(inst_col['name'])

                    inst_name = inst_col['name']
                    inst_col['value'] = scores[score_type][inst_name]

            # add views to main network
            net.viz['views'].extend(inst_views)

    return net
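
# transfer_to_enr_dict is not defined in this listing; a sketch assuming the
# standard Enrichr response row layout (rank, term name, p-value, z-score,
# combined score, overlapping genes, ...), which yields the fields used above.
def transfer_to_enr_dict(response_list):
    enr = []
    for inst_row in response_list:
        enr.append({
            'name': inst_row[1],
            'pval': inst_row[2],
            'zscore': inst_row[3],
            'combined_score': inst_row[4],
            'int_genes': inst_row[5],
        })
    return enr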
from clustergrammer import Network

net = Network()
net.load_file('txt/rc_two_cats.txt')

# print(net.dat['nodes']['row'])

cat_list = []

for inst_gene in net.dat['nodes']['row']:
    # wrap each row name in a list so a category can be appended later
    cat_list.append([inst_gene])

df = net.dat_to_df()

all_rows = df['mat'].index.tolist()

print(all_rows)

new_rows = []
for inst_row in all_rows:
    new_rows.append(list(inst_row))

print('\n\n\n')
print(new_rows)

# append a placeholder category to each row
for inst_row in new_rows:
    inst_row.append('something')

print('\n\n\n')
print(new_rows)
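
# If these category-augmented rows were passed back to clustergrammer as row
# names, they would typically be converted to (name, category) tuples first;
# a one-line sketch:
row_tuples = [tuple(inst_row) for inst_row in new_rows]
print(row_tuples)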