Example #1
import argparse

import pandas as pd

# NOTE: stack_with_source, normalise_along_axis and plot_heatmap are defined
# earlier in the full script; only a fragment is shown here (a sketch of these
# helpers follows the example).


def join(new, original):
    # The opening of this function is missing from the snippet; merging the two
    # stacked tables on their shared gene and domain columns is an assumed
    # reconstruction, not the author's original code.
    both = pd.merge(new, original)
    matrices = both.groupby(['domain', 'new', 'original']).apply(len).unstack()
    return matrices


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("original_data_table", type=argparse.FileType('r'))
    parser.add_argument("new_data_table", type=argparse.FileType('r'))
    parser.add_argument("domains", nargs="+", type=str)
    args = parser.parse_args()

    original_data = pd.read_csv(args.original_data_table, delimiter='\t')
    original_data.columns = ['gene'] + args.domains
    original_data = stack_with_source(original_data, 'original')

    new_data = pd.read_csv(args.new_data_table, delimiter='\t')
    new_data.columns = ['gene'] + args.domains
    new_data = stack_with_source(new_data, 'new')

    matrices = join(new_data, original_data)
    for domain in args.domains:
        domain_matrix = matrices.loc[domain].dropna(how='all', axis=1)
        normalised_matrix = normalise_along_axis(domain_matrix, 0)
        plot_heatmap(normalised_matrix)
        print(normalised_matrix.T.fillna(0).to_csv(sep='\t'))

    try:
        # Keep the figure windows open until the user presses ENTER; the bare
        # except tolerates an EOF when stdin is not a terminal.
        input("Press ENTER to close the windows")
    except:
        pass
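Example #3 below imports plot_heatmap and normalise_along_axis from compare_lots_of_clusters, so this first example is presumably that module with its helper definitions trimmed off. A minimal sketch of what stack_with_source, normalise_along_axis and plot_heatmap might look like, assuming pandas and matplotlib; the bodies are guesses, not the author's code:

import matplotlib.pyplot as plt
import pandas as pd


def stack_with_source(data, source):
    # Melt the wide gene-by-domain table into long format; the value column is
    # named after its source ('original' or 'new') so join() can line them up.
    return pd.melt(data, id_vars='gene', var_name='domain', value_name=source)


def normalise_along_axis(matrix, axis):
    # Divide each count by the total along the given axis, so that the chosen
    # slices of the matrix sum to one.
    return matrix.div(matrix.sum(axis=axis), axis=1 - axis)


def plot_heatmap(matrix, block=False):
    # Draw the matrix as a labelled heatmap; block=False keeps the script
    # running so several figures can stay open until the ENTER prompt.
    fig, ax = plt.subplots()
    image = ax.imshow(matrix.fillna(0).values, aspect='auto')
    ax.set_xticks(range(matrix.shape[1]))
    ax.set_xticklabels(matrix.columns, rotation=90)
    ax.set_yticks(range(matrix.shape[0]))
    ax.set_yticklabels(matrix.index)
    fig.colorbar(image)
    plt.show(block=block)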
  

Example #3
import pandas as pd

from compare_lots_of_clusters import plot_heatmap, normalise_along_axis

if __name__ == '__main__':
    original_data = pd.read_csv("original_seven_genomes_domains.table.log",
                                delimiter='\t')
    original_data.columns = ['gene', 'CIDRa', 'DBLa']

    plot_heatmap(original_data.groupby(['CIDRa', 'DBLa']).apply(len).unstack())
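The groupby(['CIDRa', 'DBLa']).apply(len).unstack() idiom builds a CIDRa-by-DBLa contingency table: each cell counts how many genes carry that particular pair of domain classes. A tiny self-contained illustration with made-up values:

import pandas as pd

# Made-up stand-in for the real domain table.
toy = pd.DataFrame({
    'gene':  ['g1', 'g2', 'g3', 'g4'],
    'CIDRa': ['a1', 'a1', 'a2', 'a2'],
    'DBLa':  ['b1', 'b2', 'b2', 'b2'],
})
# Rows are CIDRa classes, columns are DBLa classes, cells are gene counts
# (NaN where a combination never occurs): a1 -> {b1: 1, b2: 1}, a2 -> {b2: 2}.
print(toy.groupby(['CIDRa', 'DBLa']).apply(len).unstack())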
import argparse
import logging
import sys

import pandas as pd

# NOTE: load_classification_data, sort_by_subdomain, get_matrix_columns,
# get_relevant_data and plot_heatmap come from earlier in the full script
# (a sketch follows the example).


def get_cluster_sizes_and_order(classification_data):
    logging.debug("Counting cluster sizes")
    data = classification_data.groupby('subdomain').aggregate(len)
    data = data.reset_index()
    data.columns = ['subdomain', 'count']
    # DataFrame.sort() has been removed from pandas; sort_values() is the
    # current equivalent.
    return data.sort_values('subdomain')


if __name__ == '__main__':
    logging.basicConfig(format='[%(asctime)s] %(message)s',
                        level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('classification_summary', type=argparse.FileType('r'))
    parser.add_argument('distance_matrix', type=argparse.FileType('r'))
    args = parser.parse_args()

    logging.debug("Loading data")
    classification_data = load_classification_data(args.classification_summary)
    distance_matrix = pd.read_csv(args.distance_matrix, delimiter='\t')

    classification_data = sort_by_subdomain(classification_data)
    sample_order = classification_data['name']
    cluster_counts = get_cluster_sizes_and_order(classification_data)

    relevant_columns = get_matrix_columns(distance_matrix, sample_order)
    relevant_data = get_relevant_data(relevant_columns, sample_order)
    cluster_counts.to_csv(sys.stdout, sep='\t', index=False)
    logging.debug("Presenting graph")
    plot_heatmap(relevant_data, block=True)
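The script above leans on several helpers whose definitions are missing from the snippet; only the closing return of get_relevant_data survived the copy. A rough sketch of plausible implementations, assuming the classification summary is a tab-separated table with 'name' and 'subdomain' columns and that the distance matrix's first column holds the sample names (everything except the final return line is a guess; plot_heatmap is sketched under Example #1):

import pandas as pd


def load_classification_data(handle):
    # Assumed: a tab-separated table with at least 'name' and 'subdomain'.
    return pd.read_csv(handle, delimiter='\t')


def sort_by_subdomain(classification_data):
    # Order the samples by their subdomain label.
    return classification_data.sort_values('subdomain')


def get_matrix_columns(distance_matrix, sample_order):
    # Assumed: the first column holds sample names; index by it and keep only
    # the columns for the samples of interest, in subdomain order.
    indexed = distance_matrix.set_index(distance_matrix.columns[0])
    return indexed.loc[:, sample_order]


def get_relevant_data(data, sample_order):
    # The return line below is from the original snippet: select the matching
    # rows in the same order, giving a square matrix sorted by subdomain.
    return data.loc[sample_order, :]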