def _save_normalized_bar_plot(plt, counts, title, path):
    """Save a blue bar plot of ``counts`` scaled so the tallest bar is 1.

    :param plt: the ``matplotlib.pyplot`` module.
    :param counts: mapping from bar position to raw count.
    :param title: title of the figure.
    :param path: file the figure is saved to.
    :return: None
    """
    figure = plt.figure()
    try:
        # An empty mapping has no maximum to normalise by; save an empty,
        # titled figure instead of raising.
        if counts:
            keys = list(counts.keys())
            heights = np.array([counts[key] for key in keys], dtype=float)
            plt.bar(keys, heights / heights.max(), color='b')
        plt.title(title)
        plt.savefig(path)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(figure)


def plot_vertex_weight_histogram(max_distance):
    """
    Plot histograms of the vertex weights and of the vertex labels of the
    graphs generated from every file in ``data_path``.

    Three figures are saved under ``../../data/plots/``: all weights, the
    weights that occur at least 100 times, and the label counts (ordered by
    ``natural_key`` of the label, then by count).

    Side effect: files for which ``load_breaks`` returns no breaks are moved
    to ``../../data/raw_original_data/empty_files``.

    :param max_distance: forwarded to ``generateGraph``; also used in the
        plot titles and output file names.
    :return: None
    """
    from collections import Counter
    from matplotlib import pyplot as plt

    # Iterate over the files
    all_weights = np.array([], dtype=float)
    all_labels = np.array([])
    for file_name in os.listdir(data_path):
        # Load the breaks
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, file_name))
        if len(breaks) == 0:
            print('%s %s' % ('WARNING: Empty data file', file_name))
            shutil.move(os.path.join(data_path, file_name),
                        os.path.join('../../data/raw_original_data/empty_files', file_name))
            continue
        # Generate the vertices and edges
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(
            breaks, list_of_pairs, max_distance)
        all_weights = np.append(all_weights, np.array(vertex_weights))
        all_labels = np.append(all_labels, np.array(vertex_labels))

    all_w_dict = Counter(all_weights.astype(float))
    little = {weight: n for weight, n in all_w_dict.items() if n >= 100}
    print(all_w_dict)

    all_lab_dict = sorted(Counter(all_labels).items(),
                          key=lambda item: (natural_key(item[0]), item[1]))
    if not all_lab_dict:
        # No file produced any vertex, so there is nothing to unpack or plot.
        print('WARNING: No vertices found, nothing to plot')
        return
    x, y = zip(*all_lab_dict)

    suffix = str(max_distance)
    _save_normalized_bar_plot(plt, all_w_dict, 'Weights ' + suffix,
                              '../../data/plots/test_weights' + suffix + '.png')
    _save_normalized_bar_plot(plt, little, 'Weights little ' + suffix,
                              '../../data/plots/test_weights_little' + suffix + '.png')

    label_figure = plt.figure()
    try:
        plt.bar(x, y, color='g')
        plt.title('Labels ' + suffix)
        plt.savefig('../../data/plots/test_labels' + suffix + '.png')
    finally:
        plt.close(label_figure)
def plot_one_file(file_name, max_distance=1000):
    """
    Build the graph for a single data file and pass it to ``printGraph``.

    :param file_name: name of the file inside the module-level ``data_path``.
    :param max_distance: forwarded to ``generateGraph`` (default 1000).
    :return: None
    """
    # Load the breaks
    breaks, list_of_pairs = load_breaks(os.path.join(data_path, file_name))
    if len(breaks) == 0:
        print('%s %s' % ('WARNING: Empty data file', file_name))
        # There is nothing to build a graph from; stop here, as the
        # directory-wide functions do when they skip empty files.
        return

    # Generate the vertices and edges
    adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(
        breaks, list_of_pairs, max_distance)

    # Create the graph
    g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights,
                        self_links=False, connected_only=True)

    # Print the graph
    print('%s %s' % ('Showing graph of ', file_name))
    printGraph(g, name=file_name, visualize=False, show_vertex_weights=False)
def plot_all(max_distance):
    """
    Build and display the graph of every non-empty file in ``data_path``.

    Files for which ``load_breaks`` returns no breaks are reported and
    skipped. After each graph is shown its vertex ranges are printed.

    :param max_distance: forwarded to ``generateGraph``.
    :return: None
    """
    for entry in os.listdir(data_path):
        # Load the breaks of this file
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, entry))
        if len(breaks) == 0:
            print('%s %s' % ('WARNING: Empty data file', entry))
            continue

        # Vertices and edges first, then the graph object built from them
        graph_parts = generateGraph(breaks, list_of_pairs, max_distance)
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = graph_parts
        graph = generateNXGraph(
            adjacency_matrix,
            vertex_labels,
            vertex_ranges,
            vertex_weights,
            self_links=False,
            connected_only=True,
        )

        # Show the graph
        print('%s %s' % ('Showing graph of ', entry))
        printGraph(graph, show_vertex_weights=False)
        print(vertex_ranges)
def generate_all_patient_graphs(max_distance, data_path, output_path):
    """
    Generate the graph of every file in ``data_path``, split each graph into
    its connected components and write the components with at least three
    nodes to a text file in gSpan-style ``t``/``v``/``e`` records.

    The output file is ``<output_path>/gspan_subgraphs_w<max_distance>.txt``.
    Every vertex and every edge is written with the constant label ``2``.

    :param max_distance: forwarded to ``generateGraph``; also part of the
        output file name.
    :param data_path: directory whose files are loaded with ``load_breaks``.
    :param output_path: directory the output file is written to.
    :return: None
    """
    subgraphs = []
    # NOTE(review): the source indentation was lost; ``count`` is assumed to
    # index the kept subgraphs (so it matches the graph ids written below).
    count = 0
    # Iterate over the files
    for filename in os.listdir(data_path):
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, filename))
        if len(breaks) == 0:
            continue
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(
            breaks, list_of_pairs, max_distance)
        g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights,
                            self_links=False, connected_only=True)
        # nx.connected_component_subgraphs was removed in NetworkX 2.4;
        # connected_components + subgraph(...).copy() is the documented
        # replacement and is also available in older releases.
        for component in nx.connected_components(g):
            if len(component) < 3:
                continue
            print('%s %s' % (count, filename))
            subgraphs.append(g.subgraph(component).copy())
            count += 1

    # Iterate over subgraphs and store
    with open(output_path + '/gspan_subgraphs_w' + str(max_distance) + '.txt', 'w') as f:
        for graph_id, graph in enumerate(subgraphs):
            f.write('t # ' + str(graph_id) + '\n')
            # Iterate over vertices
            for v in graph.nodes():
                f.write('v ' + str(v) + ' 2\n')
            # Iterate over all edges
            for edge in graph.edges():
                f.write('e ' + str(edge[0]) + ' ' + str(edge[1]) + ' 2\n')
def generate_all_patient_graphs(max_distance):
    """
    Generates graphs according to the window sizes for all files in
    ``PATIENTS_PATH``. Decomposes these into subgraphs, according to
    connected components, keeps the components with at least three nodes
    and stores them in a format compatible with gSpan.

    The output file is
    ``GSPAN_COMPATIBLE_GRAPHS_PATH + 'gspan_subgraphs_w<max_distance>.txt'``.
    Every vertex and every edge is written with the constant label ``2``.

    :param max_distance: forwarded to ``generateGraph``; also part of the
        output file name.
    :return: None
    """
    subgraphs = []
    # NOTE(review): the source indentation was lost; ``count`` is assumed to
    # index the kept subgraphs (so it matches the graph ids written below).
    count = 0
    # Iterate over the files
    for filename in os.listdir(PATIENTS_PATH):
        breaks, list_of_pairs = load_breaks(os.path.join(PATIENTS_PATH, filename))
        if len(breaks) == 0:
            continue
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(
            breaks, list_of_pairs, max_distance)
        g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights,
                            self_links=False, connected_only=True)
        # nx.connected_component_subgraphs was removed in NetworkX 2.4;
        # connected_components + subgraph(...).copy() is the documented
        # replacement and is also available in older releases.
        for component in nx.connected_components(g):
            if len(component) < 3:
                continue
            print('%s %s' % (count, filename))
            subgraphs.append(g.subgraph(component).copy())
            count += 1

    # Iterate over subgraphs and store
    output_file = GSPAN_COMPATIBLE_GRAPHS_PATH + 'gspan_subgraphs_w' + str(max_distance) + '.txt'
    with open(output_file, 'w') as f:
        for graph_id, graph in enumerate(subgraphs):
            f.write('t # ' + str(graph_id) + '\n')
            # Iterate over vertices
            for v in graph.nodes():
                f.write('v ' + str(v) + ' 2\n')
            # Iterate over all edges
            for edge in graph.edges():
                f.write('e ' + str(edge[0]) + ' ' + str(edge[1]) + ' 2\n')
#counter_empty = 0 cliques_3 = [] cliques_4 = [] data_path = '../vcfshorts/allfiles' #data_path = '../vcfshorts/chromexsamples' #file_name = 'fca3f7d0-2231-661c-e040-11ac0c4832fd.vcf.tsv' #file_name = 'feccee20-a62d-4152-b832-b9fdaca87a61.vcf_chromex.tsv' for file_name in os.listdir(data_path): if file_name.split('.')[0] in cancer_types[t]: breaks, list_of_pairs = load_breaks(os.path.join(data_path,file_name)) #if len(breaks) == 0: # #print 'WARNING: Empty data file',file_name # counter_empty +=1 adjacency_matrix, vertex_labels, vertex_ranges = generateGraph(breaks, list_of_pairs, max_distance=100) counter += 1 if len(vertex_labels)!=0: c3,c4 = getCliques(adjacency_matrix,vertex_labels) cliques_3.append(c3) cliques_4.append(c4) else: cliques_3.append(0) cliques_4.append(0) #print 'Cancer type',t, 'Mean/std of 3 cliques',np.asarray(cliques_3).mean(),np.asarray(cliques_3).std(),'Mean/std of 4 cliques',np.asarray(cliques_4).mean(),np.asarray(cliques_4).std() print t, counter, np.around(np.asarray(cliques_3).mean(),decimals=2),np.around(np.asarray(cliques_3).std(),decimals=2),np.around(np.asarray(cliques_4).mean(),decimals=2),np.around(np.asarray(cliques_4).std(),decimals=2) counter_all += counter #counter_all_empty += counter_empty cliques_all_3 += cliques_3 cliques_all_4 += cliques_4