Example 1
def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True, 
                          requested_views=['pct_row_sum', 'N_row_sum'],
                          linkage_type='average', sim_mat=False, filter_sim=0.1,
                          calc_cat_pval=False):
  ''' 
  This will calculate multiple views of a clustergram by filtering the 
  data and clustering after each filtering. This filtering will keep the top 
  N rows based on some quantity (sum, num-non-zero, etc). 
  '''
  from copy import deepcopy
  import calc_clust, run_filter, make_views, make_sim_mat, cat_pval
  import scipy

  df = net.dat_to_df()

  threshold = 0.0001
  df = run_filter.df_filter_row_sum(df, threshold)
  df = run_filter.df_filter_col_sum(df, threshold)

  # calculate initial view with no row filtering
  net.df_to_dat(df)

  inst_dm = calc_clust.cluster_row_and_col(net, dist_type=dist_type, 
                                linkage_type=linkage_type, 
                                run_clustering=run_clustering, 
                                dendro=dendro, ignore_cat=False, 
                                calc_cat_pval=calc_cat_pval)

  all_views = []
  send_df = deepcopy(df)

  if 'N_row_sum' in requested_views:
    all_views = make_views.N_rows(net, send_df, all_views,
                                  dist_type=dist_type, rank_type='sum')

  if 'N_row_var' in requested_views:
    all_views = make_views.N_rows(net, send_df, all_views,
                                  dist_type=dist_type, rank_type='var')

  if 'pct_row_sum' in requested_views:
    all_views = make_views.pct_rows(net, send_df, all_views,
                                    dist_type=dist_type, rank_type='sum')

  if 'pct_row_var' in requested_views:
    all_views = make_views.pct_rows(net, send_df, all_views,
                                    dist_type=dist_type, rank_type='var')

  if sim_mat is True:
    print('make similarity matrices of rows and columns, add to viz data structure')
    sim_net = make_sim_mat.main(net, inst_dm, filter_sim)

    net.sim = {}
    net.sim['row'] = sim_net['row'].viz
    net.sim['col'] = sim_net['col'].viz

  net.viz['views'] = all_views
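
A minimal driving sketch for the function above, assuming the Clustergrammer-style Network class is importable as shown and that the function lives in a make_clust module; the input file name is illustrative.

from clustergrammer import Network  # assumed import path for the Network class
import make_clust                   # assumed module holding the function above

net = Network()
net.load_file('example_matrix.txt')  # hypothetical tab-separated input matrix

# build only the row-sum views and skip the similarity matrices
make_clust.make_clust(net,
                      dist_type='cosine',
                      requested_views=['N_row_sum', 'pct_row_sum'],
                      sim_mat=False)

# the computed views are attached to the visualization data structure
print(len(net.viz['views']))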
Example 2
 def filter_sum(self, inst_rc, threshold, take_abs=True):
     import run_filter
     inst_df = self.dat_to_df()
     if inst_rc == 'row':
         inst_df = run_filter.df_filter_row_sum(inst_df, threshold,
                                                take_abs)
     elif inst_rc == 'col':
         inst_df = run_filter.df_filter_col_sum(inst_df, threshold,
                                                take_abs)
     self.df_to_dat(inst_df)
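
run_filter.df_filter_row_sum is not shown on this page; the sketch below is only an illustration of what it could look like, assuming df is a dict holding a pandas DataFrame under the 'mat' key (as in the pct_rows examples) and that rows are kept when their sum exceeds the threshold.

import pandas as pd

def df_filter_row_sum(df, threshold, take_abs=True):
    # illustrative stand-in for run_filter.df_filter_row_sum (assumed behavior)
    mat = df['mat']
    # rank rows by their (optionally absolute) sum across columns
    row_sum = mat.abs().sum(axis=1) if take_abs else mat.sum(axis=1)
    # keep only rows whose sum exceeds the threshold
    df['mat'] = mat.loc[row_sum > threshold]
    return df

# toy check with made-up values
toy = {'mat': pd.DataFrame([[1, 2], [0, 0], [-5, 1]], index=['a', 'b', 'c'])}
print(df_filter_row_sum(toy, 0.5)['mat'].index.tolist())  # ['a', 'c']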
Example 3
def pct_rows(net, df, all_views, dist_type, rank_type):
    from __init__ import Network
    from copy import deepcopy
    import numpy as np
    import calc_clust, run_filter

    copy_net = deepcopy(net)

    if len(net.dat['node_info']['col']['cat']) > 0:
        cat_key_col = {}
        for i in range(len(net.dat['nodes']['col'])):
            cat_key_col[net.dat['nodes']['col'][i]] = \
                net.dat['node_info']['col']['cat'][i]

    all_filt = range(10)
    all_filt = [i / float(10) for i in all_filt]

    mat = deepcopy(df['mat'])
    sum_row = np.sum(mat, axis=1)
    max_sum = max(sum_row)

    for inst_filt in all_filt:

        cutoff = inst_filt * max_sum
        copy_net = deepcopy(net)
        inst_df = deepcopy(df)
        inst_df = run_filter.df_filter_row_sum(inst_df, cutoff, take_abs=False)

        tmp_net = deepcopy(Network())
        tmp_net.df_to_dat(inst_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type=dist_type,
                                               run_clustering=True)

            except:
                # fall back to an unclustered ordering if clustering fails
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type=dist_type,
                                               run_clustering=False)

            inst_view = {}
            inst_view['pct_row_' + rank_type] = inst_filt
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']

            all_views.append(inst_view)

        except:
            # skip this cutoff entirely if the view could not be built
            pass

    return all_views
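
The all_filt loop above sweeps ten cutoffs, from 0% to 90% of the largest row sum; a toy illustration with made-up numbers:

import numpy as np

# made-up 3x3 matrix standing in for df['mat']
mat = np.array([[1.0, 2.0, 3.0],
                [0.5, 0.5, 0.5],
                [4.0, 4.0, 4.0]])
max_sum = np.sum(mat, axis=1).max()            # 12.0 for this toy matrix

all_filt = [i / float(10) for i in range(10)]  # 0.0, 0.1, ..., 0.9
cutoffs = [f * max_sum for f in all_filt]
print(cutoffs)                                 # [0.0, 1.2, 2.4, ..., 10.8]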
Example 4
 def filter_sum(self, inst_rc, threshold, take_abs=True):
   ''' 
   Filter a network's rows or columns based on the sum across rows or columns 
   Works on the network object 
   '''
   import run_filter
   inst_df = self.dat_to_df()
   if inst_rc == 'row':
     inst_df = run_filter.df_filter_row_sum(inst_df, threshold, take_abs)
   elif inst_rc == 'col':
     inst_df = run_filter.df_filter_col_sum(inst_df, threshold, take_abs)
   self.df_to_dat(inst_df)
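
A hypothetical call of this method on a Network instance; the import path, load_file, the input file name, and the exact keep/drop semantics of the threshold are assumptions, while the signature matches the method above.

from clustergrammer import Network  # assumed import path for the Network class

net = Network()
net.load_file('example_matrix.txt')  # hypothetical input matrix

# keep rows whose absolute sum across columns exceeds 10 (assumed semantics)
net.filter_sum('row', threshold=10)

# keep columns whose signed sum across rows exceeds 5
net.filter_sum('col', threshold=5, take_abs=False)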
Example 5
def pct_rows(net, df, all_views, dist_type, rank_type):
  from __init__ import Network
  from copy import deepcopy
  import numpy as np
  import calc_clust, run_filter

  copy_net = deepcopy(net)

  if len(net.dat['node_info']['col']['cat']) > 0:
    cat_key_col = {}
    for i in range(len(net.dat['nodes']['col'])):
      cat_key_col[net.dat['nodes']['col'][i]] = \
          net.dat['node_info']['col']['cat'][i]

  all_filt = range(10)
  all_filt = [i / float(10) for i in all_filt]

  mat = deepcopy(df['mat'])
  sum_row = np.sum(mat, axis=1)
  max_sum = max(sum_row)

  for inst_filt in all_filt:

    cutoff = inst_filt * max_sum
    copy_net = deepcopy(net)
    inst_df = deepcopy(df)
    inst_df = run_filter.df_filter_row_sum(inst_df, cutoff, take_abs=False)

    tmp_net = deepcopy(Network())
    tmp_net.df_to_dat(inst_df)

    try:
      try:
        calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type, 
                                       run_clustering=True)

      except:
        # fall back to an unclustered ordering if clustering fails
        calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type, 
                                       run_clustering=False)

      inst_view = {}
      inst_view['pct_row_' + rank_type] = inst_filt
      inst_view['dist'] = 'cos'
      inst_view['nodes'] = {}
      inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
      inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']

      all_views.append(inst_view)

    except:
      # skip this cutoff entirely if the view could not be built
      pass

  return all_views  
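
For reference, each entry appended to all_views by pct_rows has this shape (values here are illustrative; the node lists come from tmp_net.viz after clustering):

inst_view = {
    'pct_row_sum': 0.3,        # cutoff used: 30% of the maximum row sum
    'dist': 'cos',             # distance label recorded for the view
    'nodes': {
        'row_nodes': [...],    # tmp_net.viz['row_nodes']
        'col_nodes': [...],    # tmp_net.viz['col_nodes']
    },
}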
Example 6
 def filter_sum(self, inst_rc, threshold, take_abs=True):
     '''
     Filter a network's rows or columns based on the sum across rows or columns
     Works on the network object
     '''
     import run_filter
     inst_df = self.dat_to_df()
     if inst_rc == 'row':
         inst_df = run_filter.df_filter_row_sum(inst_df, threshold,
                                                take_abs)
     elif inst_rc == 'col':
         inst_df = run_filter.df_filter_col_sum(inst_df, threshold,
                                                take_abs)
     self.df_to_dat(inst_df)
Example 7
def make_clust(net,
               dist_type='cosine',
               run_clustering=True,
               dendro=True,
               requested_views=['pct_row_sum', 'N_row_sum'],
               linkage_type='average',
               sim_mat=False,
               filter_sim=0.1,
               calc_cat_pval=False,
               sim_mat_views=['N_row_sum'],
               run_enrichr=None):
    '''
    This will calculate multiple views of a clustergram by filtering the
    data and clustering after each filtering. This filtering will keep the top
    N rows based on some quantity (sum, num-non-zero, etc).
    '''
    from copy import deepcopy
    import scipy
    import calc_clust, run_filter, make_views, make_sim_mat, cat_pval
    import enrichr_functions as enr_fun

    df = net.dat_to_df()

    threshold = 0.0001
    df = run_filter.df_filter_row_sum(df, threshold)
    df = run_filter.df_filter_col_sum(df, threshold)

    if run_enrichr is not None:
        df = enr_fun.add_enrichr_cats(df, 'row', run_enrichr)

    # calculate initial view with no row filtering
    net.df_to_dat(df)

    inst_dm = calc_clust.cluster_row_and_col(net,
                                             dist_type=dist_type,
                                             linkage_type=linkage_type,
                                             run_clustering=run_clustering,
                                             dendro=dendro,
                                             ignore_cat=False,
                                             calc_cat_pval=calc_cat_pval)

    all_views = []
    send_df = deepcopy(df)

    if 'N_row_sum' in requested_views:
        all_views = make_views.N_rows(net,
                                      send_df,
                                      all_views,
                                      dist_type=dist_type,
                                      rank_type='sum')

    if 'N_row_var' in requested_views:
        all_views = make_views.N_rows(net,
                                      send_df,
                                      all_views,
                                      dist_type=dist_type,
                                      rank_type='var')

    if 'pct_row_sum' in requested_views:
        all_views = make_views.pct_rows(net,
                                        send_df,
                                        all_views,
                                        dist_type=dist_type,
                                        rank_type='sum')

    if 'pct_row_var' in requested_views:
        all_views = make_views.pct_rows(net,
                                        send_df,
                                        all_views,
                                        dist_type=dist_type,
                                        rank_type='var')

    if sim_mat is True:
        print(
            'make similarity matrices of rows and columns, add to viz data structure'
        )
        sim_net = make_sim_mat.main(net, inst_dm, filter_sim, sim_mat_views)

        net.sim = {}
        net.sim['row'] = sim_net['row'].viz
        net.sim['col'] = sim_net['col'].viz

    net.viz['views'] = all_views
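
A hypothetical call exercising the sim_mat branch of this variant; the import path, module name, and file name are assumptions, while the keyword names come from the signature above.

from clustergrammer import Network  # assumed import path for the Network class
import make_clust                   # assumed module holding the function above

net = Network()
net.load_file('example_matrix.txt')  # hypothetical input matrix

make_clust.make_clust(net,
                      requested_views=['N_row_sum'],
                      sim_mat=True,       # also build row/col similarity matrices
                      filter_sim=0.2,     # similarity filter passed to make_sim_mat (assumed semantics)
                      sim_mat_views=['N_row_sum'])

# the sim_mat branch above attaches the similarity visualizations here
print(sorted(net.sim.keys()))        # ['col', 'row']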
Example 8
def calc_cat_clust_order(net, inst_rc):
  '''
  cluster category subset of data
  '''
  from __init__ import Network
  from copy import deepcopy
  import calc_clust, run_filter

  inst_keys = net.dat['node_info'][inst_rc].keys()
  all_cats = [x for x in inst_keys if 'cat-' in x]

  if len(all_cats) > 0:

    for inst_name_cat in all_cats:

      tmp_name = 'dict_' + inst_name_cat.replace('-', '_')
      dict_cat = net.dat['node_info'][inst_rc][tmp_name]

      all_cats = sorted(dict_cat.keys())

      # this is the ordering of the columns based on their category, not
      # including their clustering ordering within category
      all_cat_orders = []
      tmp_names_list = []
      for inst_cat in all_cats:

        inst_nodes = dict_cat[inst_cat]

        tmp_names_list.extend(inst_nodes)

        cat_net = deepcopy(Network())

        cat_net.dat['mat'] = deepcopy(net.dat['mat'])
        cat_net.dat['nodes'] = deepcopy(net.dat['nodes'])

        cat_df = cat_net.dat_to_df()

        sub_df = {}
        if inst_rc == 'col':
          sub_df['mat'] = cat_df['mat'][inst_nodes]
        elif inst_rc == 'row':
          # need to transpose df
          cat_df['mat'] = cat_df['mat'].transpose()
          sub_df['mat'] = cat_df['mat'][inst_nodes]
          sub_df['mat'] = sub_df['mat'].transpose()

        # filter matrix before clustering
        ###################################
        threshold = 0.0001
        sub_df = run_filter.df_filter_row_sum(sub_df, threshold)
        sub_df = run_filter.df_filter_col_sum(sub_df, threshold)

        # load back to dat
        cat_net.df_to_dat(sub_df)

        cat_mat_shape = cat_net.dat['mat'].shape

        try:
          # only cluster the category submatrix if it has more than one
          # row and more than one column
          if cat_mat_shape[0] > 1 and cat_mat_shape[1] > 1:

            calc_clust.cluster_row_and_col(cat_net, 'cos')
            inst_cat_order = cat_net.dat['node_info'][inst_rc]['clust']
          else:
            inst_cat_order = range(len(cat_net.dat['nodes'][inst_rc]))

        except:
          # fall back to the existing node order if clustering fails
          inst_cat_order = range(len(cat_net.dat['nodes'][inst_rc]))


        prev_order_len = len(all_cat_orders)

        # add prev order length to the current order number
        inst_cat_order = [i + prev_order_len for i in inst_cat_order]
        all_cat_orders.extend(inst_cat_order)

      names_clust_list = [x for (y, x) in sorted(zip(all_cat_orders,
                          tmp_names_list))]

      # calc category-cluster order
      final_order = []

      for i in range(len(net.dat['nodes'][inst_rc])):

        inst_node_name = net.dat['nodes'][inst_rc][i]
        inst_node_num = names_clust_list.index(inst_node_name)
        final_order.append(inst_node_num)

      net.dat['node_info'][inst_rc][inst_name_cat.replace('-', '_') +
                                     '_index'] = final_order
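
The reordering above relies on the sorted(zip(...)) idiom; a toy run with made-up names and order numbers shows how names_clust_list and final_order relate:

# made-up cluster orders accumulated across categories
all_cat_orders = [2, 0, 1, 4, 3]
tmp_names_list = ['gene-A', 'gene-B', 'gene-C', 'gene-D', 'gene-E']

# node names sorted by their accumulated within-category cluster order
names_clust_list = [x for (y, x) in sorted(zip(all_cat_orders, tmp_names_list))]
print(names_clust_list)  # ['gene-B', 'gene-C', 'gene-A', 'gene-E', 'gene-D']

# final_order stores, for each original node, its position in that list
final_order = [names_clust_list.index(name) for name in tmp_names_list]
print(final_order)       # [2, 0, 1, 4, 3]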