Esempio n. 1
0
def plot_vertex_weight_histogram(max_distance):
    """
    This function plots one histogram for the vertex weights and another for the chromosomes with the breaks
    :param max_distance:
    :return:
    """
    from matplotlib import pyplot as plt

    # Iterate over the files
    all_weights = np.array([], dtype=np.float)
    all_labels = np.array([])
    for file_name in os.listdir(data_path):
        # Load the brakes
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, file_name))
        if len(breaks) == 0:
            print 'WARNING: Empty data file', file_name
            shutil.move(os.path.join(data_path, file_name), os.path.join('../../data/raw_original_data/empty_files', file_name))
            continue
        # Generate the vertices and edges
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(breaks, list_of_pairs,
                                                                                       max_distance)
        all_weights = np.append(all_weights, np.array(vertex_weights))
        all_labels = np.append(all_labels, np.array(vertex_labels))

    from collections import Counter

    all_w_dict = Counter(all_weights.astype(np.float))
    little = {k: all_w_dict[k] for k in all_w_dict.keys() if all_w_dict[k] >= 100}

    print(all_w_dict)

    all_lab_dict = sorted(Counter(all_labels).items(), key=lambda (k, v): (natural_key(k), v))
    x, y = zip(*all_lab_dict)

    # print np.array(all_w_dict.values()).astype(np.float)/np.max(all_w_dict.values())

    plt.figure()
    plt.bar(list(all_w_dict.keys()), np.array(all_w_dict.values()).astype(np.float) / np.max(all_w_dict.values()),
            color='b')
    plt.title('Weights ' + str(max_distance))
    plt.savefig('../../data/plots/test_weights' + str(max_distance) + '.png')
    plt.figure()
    plt.bar(list(little.keys()), np.array(little.values()).astype(np.float) / np.max(little.values()),
            color='b')
    plt.title('Weights little ' + str(max_distance))
    plt.savefig('../../data/plots/test_weights_little' + str(max_distance) + '.png')
    plt.figure()
    plt.bar(x, y, color='g')
    plt.title('Labels ' + str(max_distance))
    plt.savefig('../../data/plots/test_labels' + str(max_distance) + '.png')
    # plt.show()
    plt.close()
Esempio n. 2
0
def plot_one_file(file_name, max_distance=1000):
    # Load the brakes
    breaks, list_of_pairs = load_breaks(os.path.join(data_path, file_name))
    if len(breaks) == 0:
        print 'WARNING: Empty data file', file_name

    # Generate the vertices and edges
    adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(breaks, list_of_pairs, max_distance)
    # Create the graph
    g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights, self_links=False,
                        connected_only=True)
    # Print the graph
    print 'Showing graph of ', file_name
    printGraph(g, name=file_name, visualize=False, show_vertex_weights=False)
Esempio n. 3
0
def plot_all(max_distance):
    # Iterate over the files
    for file_name in os.listdir(data_path):
        # Load the brakes
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, file_name))
        if len(breaks) == 0:
            print 'WARNING: Empty data file', file_name
            continue
        # Generate the vertices and edges
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(breaks, list_of_pairs,
                                                                                       max_distance)
        # Create the graph
        g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights, self_links=False,
                            connected_only=True)
        # Print the graph
        print 'Showing graph of ', file_name
        printGraph(g, show_vertex_weights=False)

        print vertex_ranges
Esempio n. 4
0
def generate_all_patient_graphs(max_distance, data_path, output_path):
    subgraphs = []
    # Iterate over the files
    count = 0
    for filename in os.listdir(data_path):
        breaks, list_of_pairs = load_breaks(os.path.join(data_path, filename))
        if len(breaks) == 0:
            # print 'WARNING: Empty data file',file_name
            continue
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(
            breaks, list_of_pairs, max_distance)
        g = generateNXGraph(adjacency_matrix,
                            vertex_labels,
                            vertex_ranges,
                            vertex_weights,
                            self_links=False,
                            connected_only=True)
        candidates = list(nx.connected_component_subgraphs(g))
        for c in candidates:
            if len(c.nodes()) >= 3 and len(c.nodes()) >= 3:
                print count, filename
                subgraphs.append(c)
                count += 1

    # Iterate over subgraphs and store
    with open(output_path + '/gspan_subgraphs_w' + str(max_distance) + '.txt',
              'w') as f:
        counter = 0
        for g in subgraphs:
            f.write('t # ' + str(counter) + '\n')
            counter += 1
            # Iterate over vertices
            for v in g.nodes():
                f.write('v ' + str(v) + ' 2\n')
            # Iterate over all edges
            for edge in g.edges():
                f.write('e ' + str(edge[0]) + ' ' + str(edge[1]) + ' 2\n')
def generate_all_patient_graphs(max_distance):
    """
    Generates graphs according to the window sizes for all files.
    Decompose these into subgraphs, according to connected components, and stores in a format compatible for gSpan.
    """
    subgraphs = []
    # Iterate over the files
    count = 0
    for filename in os.listdir(PATIENTS_PATH):
        breaks, list_of_pairs = load_breaks(os.path.join(PATIENTS_PATH, filename))
        if len(breaks) == 0:
            # print 'WARNING: Empty data file',file_name
            continue
        adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights = generateGraph(breaks, list_of_pairs,
                                                                                       max_distance)
        g = generateNXGraph(adjacency_matrix, vertex_labels, vertex_ranges, vertex_weights, self_links=False,
                            connected_only=True)
        candidates = list(nx.connected_component_subgraphs(g))
        for c in candidates:
            if len(c.nodes()) >= 3 and len(c.nodes()) >= 3:
                print count, filename
                subgraphs.append(c)
                count += 1

    # Iterate over subgraphs and store
    with open(GSPAN_COMPATIBLE_GRAPHS_PATH + 'gspan_subgraphs_w' + str(max_distance) + '.txt', 'w') as f:
        counter = 0
        for g in subgraphs:
            f.write('t # ' + str(counter) + '\n')
            counter += 1
            # Iterate over vertices
            for v in g.nodes():
                f.write('v ' + str(v) + ' 2\n')
            # Iterate over all edges
            for edge in g.edges():
                f.write('e ' + str(edge[0]) + ' ' + str(edge[1]) + ' 2\n')
    # NOTE(review): from here to the end of the visible source the code reads or
    # updates names that are never bound in the visible code (`t`, `cancer_types`,
    # `getCliques`, `counter_all`, `cliques_all_3`, `cliques_all_4`) and unpacks
    # three values from generateGraph where the code above unpacks four. It looks
    # like the tail of a different, per-cancer-type routine that ended up inside
    # this function body - confirm and relocate or remove.
    #counter_empty = 0
    # One entry is appended to each list per file that passes the filter below.
    cliques_3 = []
    cliques_4 = []
    # Directory whose entries are scanned below.
    data_path = '../vcfshorts/allfiles'
    #data_path = '../vcfshorts/chromexsamples'
    #file_name = 'fca3f7d0-2231-661c-e040-11ac0c4832fd.vcf.tsv'
    #file_name = 'feccee20-a62d-4152-b832-b9fdaca87a61.vcf_chromex.tsv'
    
    for file_name in os.listdir(data_path):
        # Only files whose name up to the first '.' is contained in cancer_types[t]
        # are processed.
        if file_name.split('.')[0] in cancer_types[t]:
            breaks, list_of_pairs = load_breaks(os.path.join(data_path,file_name))
            #if len(breaks) == 0:
            #    #print 'WARNING: Empty data file',file_name
            #    counter_empty +=1
            
            # Uses a hard-coded max_distance of 100 and expects three return values.
            adjacency_matrix, vertex_labels, vertex_ranges = generateGraph(breaks, list_of_pairs, max_distance=100)
            # Incremented once per processed file; its starting value is not set in
            # this fragment.
            counter += 1
            if len(vertex_labels)!=0:
                # presumably the 3-clique and 4-clique results for this file -
                # confirm against getCliques
                c3,c4 = getCliques(adjacency_matrix,vertex_labels)
                cliques_3.append(c3)
                cliques_4.append(c4)
            else:
                # A graph without vertices contributes 0 to both lists.
                cliques_3.append(0)
                cliques_4.append(0)

    #print 'Cancer type',t, 'Mean/std of 3 cliques',np.asarray(cliques_3).mean(),np.asarray(cliques_3).std(),'Mean/std of 4 cliques',np.asarray(cliques_4).mean(),np.asarray(cliques_4).std()
    # Prints t, the file counter, then mean and std of cliques_3 and of cliques_4,
    # each rounded to two decimals.
    print t, counter, np.around(np.asarray(cliques_3).mean(),decimals=2),np.around(np.asarray(cliques_3).std(),decimals=2),np.around(np.asarray(cliques_4).mean(),decimals=2),np.around(np.asarray(cliques_4).std(),decimals=2)
    # Fold this round's counter and clique lists into the running totals.
    counter_all += counter
    #counter_all_empty += counter_empty
    cliques_all_3 += cliques_3
    cliques_all_4 += cliques_4