# NOTE(review): the line below is whitespace-collapsed source; its original
# line breaks and indentation have been lost.
# It opens with the last two statements of a function whose header is not
# visible here: they count the rows in each ('domain', 'new', 'original')
# group, unstack the result and return it.
# The rest is the script entry point: it parses two tab-separated tables plus
# a list of domain names, renames the columns of each table to
# ['gene'] + domains, passes each table through stack_with_source (tagged
# 'original' / 'new'), combines them with join, and then, for every domain,
# drops all-NaN columns, normalises via normalise_along_axis(..., 0), plots the
# heatmap and prints the transposed, zero-filled matrix as tab-separated text.
# NOTE(review): the `print` statement form is Python 2 only, and the bare
# `except: pass` around input() swallows every error — presumably to ignore the
# error Python 2's input() raises on an empty line; confirm.
matrices = both.groupby(['domain', 'new', 'original']).apply(len).unstack() return matrices if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("original_data_table", type=argparse.FileType('r')) parser.add_argument("new_data_table", type=argparse.FileType('r')) parser.add_argument("domains", nargs="+", type=str) args = parser.parse_args() original_data = pd.read_csv(args.original_data_table, delimiter='\t') original_data.columns = ['gene'] + args.domains original_data = stack_with_source(original_data, 'original') new_data = pd.read_csv(args.new_data_table, delimiter='\t') new_data.columns = ['gene'] + args.domains new_data = stack_with_source(new_data, 'new') matrices = join(new_data, original_data) for domain in args.domains: domain_matrix = matrices.loc[domain].dropna(how='all', axis=1) normalised_matrix = normalise_along_axis(domain_matrix, 0) plot_heatmap(normalised_matrix) print normalised_matrix.T.fillna(0).to_csv(sep='\t') try: input("Press ENTER to close the windows") except: pass
if __name__ == '__main__':
    # Script entry point: compare the domain assignments in two tab-separated
    # tables and show, per domain, how the new labels map onto the originals.
    #
    # Positional arguments:
    #   original_data_table  tab-separated table of the original assignments
    #   new_data_table       tab-separated table of the new assignments
    #   domains              one or more domain names; they become the column
    #                        names that follow the leading 'gene' column
    parser = argparse.ArgumentParser()
    parser.add_argument("original_data_table", type=argparse.FileType('r'))
    parser.add_argument("new_data_table", type=argparse.FileType('r'))
    parser.add_argument("domains", nargs="+", type=str)
    args = parser.parse_args()

    # Both tables get the same column layout before being tagged with the
    # source they came from.
    original_data = pd.read_csv(args.original_data_table, delimiter='\t')
    original_data.columns = ['gene'] + args.domains
    original_data = stack_with_source(original_data, 'original')

    new_data = pd.read_csv(args.new_data_table, delimiter='\t')
    new_data.columns = ['gene'] + args.domains
    new_data = stack_with_source(new_data, 'new')

    matrices = join(new_data, original_data)

    for domain in args.domains:
        # Columns that are entirely NaN carry no information for this domain.
        domain_matrix = matrices.loc[domain].dropna(how='all', axis=1)
        normalised_matrix = normalise_along_axis(domain_matrix, 0)
        plot_heatmap(normalised_matrix)
        # Single-argument call form prints identically on Python 2 and 3,
        # whereas the bare `print x` statement is a syntax error on Python 3.
        print(normalised_matrix.T.fillna(0).to_csv(sep='\t'))

    # Python 2's input() evaluates whatever is typed; raw_input() only reads a
    # line. Fall back to input() on Python 3, where raw_input does not exist.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input

    # Presumably the prompt keeps the plot windows open until the user is done.
    # A closed stdin or Ctrl-C should simply end the script, so only those two
    # conditions are ignored instead of every possible exception.
    try:
        wait_for_enter("Press ENTER to close the windows")
    except (EOFError, KeyboardInterrupt):
        pass
from compare_lots_of_clusters import plot_heatmap, normalise_along_axis


if __name__ == '__main__':
    # Plot how often each (CIDRa, DBLa) label pair occurs in the original
    # seven-genome domain table.
    domain_table = pd.read_csv(
        "original_seven_genomes_domains.table.log",
        delimiter='\t',
    )
    domain_table.columns = ['gene', 'CIDRa', 'DBLa']

    # One count per (CIDRa, DBLa) pair, pivoted so that CIDRa labels index the
    # rows and DBLa labels index the columns.
    pair_counts = domain_table.groupby(['CIDRa', 'DBLa']).apply(len)
    count_matrix = pair_counts.unstack()
    plot_heatmap(count_matrix)
def get_cluster_sizes_and_order(classification_data):
    """Count how many rows belong to each subdomain.

    Args:
        classification_data: DataFrame with a 'subdomain' column; any other
            columns are ignored.

    Returns:
        DataFrame with the columns 'subdomain' and 'count', one row per
        distinct subdomain, sorted by 'subdomain'.
    """
    logging.debug("Counting cluster sizes")
    # size() yields exactly one count per group regardless of how many other
    # columns the input has; aggregate(len) produced one count column per
    # input column, which broke the two-name column assignment below for
    # inputs with more than two columns.
    data = classification_data.groupby('subdomain').size().reset_index()
    data.columns = ['subdomain', 'count']
    # DataFrame.sort was removed in pandas 0.20; prefer sort_values and only
    # fall back to sort on pandas versions that predate it.
    if hasattr(data, 'sort_values'):
        return data.sort_values('subdomain')
    return data.sort('subdomain')


if __name__ == '__main__':
    # Script entry point: print the per-subdomain cluster sizes as
    # tab-separated text and show the distance matrix as a heatmap with the
    # samples ordered by subdomain.
    #
    # Positional arguments:
    #   classification_summary  file read by load_classification_data
    #   distance_matrix         tab-separated distance matrix
    logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('classification_summary', type=argparse.FileType('r'))
    parser.add_argument('distance_matrix', type=argparse.FileType('r'))
    args = parser.parse_args()

    logging.debug("Loading data")
    classification_data = load_classification_data(args.classification_summary)
    distance_matrix = pd.read_csv(args.distance_matrix, delimiter='\t')

    # The sample order is taken from the 'name' column after sorting by
    # subdomain, and is then used to select and order the matrix data.
    classification_data = sort_by_subdomain(classification_data)
    sample_order = classification_data['name']
    cluster_counts = get_cluster_sizes_and_order(classification_data)

    relevant_columns = get_matrix_columns(distance_matrix, sample_order)
    relevant_data = get_relevant_data(relevant_columns, sample_order)

    cluster_counts.to_csv(sys.stdout, sep='\t', index=False)

    logging.debug("Presenting graph")
    plot_heatmap(relevant_data, block=True)
# NOTE(review): the line below is whitespace-collapsed source; its original
# line breaks and indentation have been lost.
# It opens with the final return of a function whose header is not visible
# here: it returns the rows of `data` labelled by sample_order, all columns.
# It then defines get_cluster_sizes_and_order, which groups the classification
# data by 'subdomain', counts the rows per group, names the resulting columns
# 'subdomain' and 'count', and returns them sorted by 'subdomain'.
# The rest is the script entry point: it configures DEBUG logging, parses the
# classification summary and distance matrix arguments, loads both, orders the
# samples by subdomain, writes the cluster counts to stdout as tab-separated
# text and finally calls plot_heatmap(..., block=True).
# NOTE(review): DataFrame.sort was removed in pandas 0.20 — presumably this
# targets an older pandas; confirm before upgrading.
# NOTE(review): assigning two column names after aggregate(len) presumably
# relies on the input having exactly two columns — verify against the loader.
return data.loc[sample_order, :] def get_cluster_sizes_and_order(classification_data): logging.debug("Counting cluster sizes") data = classification_data.groupby('subdomain').aggregate(len) data = data.reset_index() data.columns = ['subdomain', 'count'] return data.sort('subdomain') if __name__ == '__main__': logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('classification_summary', type=argparse.FileType('r')) parser.add_argument('distance_matrix', type=argparse.FileType('r')) args = parser.parse_args() logging.debug("Loading data") classification_data = load_classification_data(args.classification_summary) distance_matrix = pd.read_csv(args.distance_matrix, delimiter='\t') classification_data = sort_by_subdomain(classification_data) sample_order = classification_data['name'] cluster_counts = get_cluster_sizes_and_order(classification_data) relevant_columns = get_matrix_columns(distance_matrix, sample_order) relevant_data = get_relevant_data(relevant_columns, sample_order) cluster_counts.to_csv(sys.stdout, sep='\t', index=False) logging.debug("Presenting graph") plot_heatmap(relevant_data, block=True)