Example #1
def plot_distance():
    df_path = nt.get_path() + '/data/Good_et_al/network_features.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer')

    fig = plt.figure()
    x = df.N.values
    y = df.d_mean.values
    #plt.scatter(x, y, marker = "o", edgecolors='none', c = '#87CEEB', s = 120, zorder=3)
    plt.scatter(x, y, c='#175ac6', marker='o', s=120,
                edgecolors='#244162', linewidth=0.6, alpha=0.9, zorder=3)

    x_range = list(range(10, max(x)))
    barabasi_albert_range = [(np.log(i) / np.log(np.log(i))) for i in x_range]
    random_range = [np.log(i) for i in x_range]
    plt.plot(x_range, barabasi_albert_range, c='r', lw=2.5, ls='--')
    plt.plot(x_range, random_range, c='k', lw=2.5, ls='--')

    plt.xlabel('Network size, ' + r'$N$', fontsize=18)
    plt.ylabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
               fontsize=16)
    #plt.xscale('log')
    #plt.ylim(0.05, 1.5)

    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_N_vs_d.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
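The two dashed reference curves above encode the expected small-world scalings: mean distance ~ ln N / ln ln N for a Barabási–Albert network and ~ ln N for a random graph. A minimal sketch (not part of the original pipeline; assumes networkx, which this script does not otherwise import) checking those scalings on synthetic graphs:

# Compare mean shortest-path distance of synthetic Barabasi-Albert and
# Erdos-Renyi graphs against the two analytic reference curves above.
import numpy as np
import networkx as nx

for n in [100, 500, 1000]:
    ba = nx.barabasi_albert_graph(n, m=2, seed=42)
    er = nx.erdos_renyi_graph(n, p=4 / n, seed=42)
    # restrict to the largest connected component so distances are defined
    er = er.subgraph(max(nx.connected_components(er), key=len))
    print(n,
          round(nx.average_shortest_path_length(ba), 2),
          round(np.log(n) / np.log(np.log(n)), 2),
          round(nx.average_shortest_path_length(er), 2),
          round(np.log(n), 2))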
def clean_tenaillon_et_al(self):
    df_in = nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2.csv'
    df_out = open(
        nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2_clean.csv',
        'w')
    header = ['Lines', 'Position', 'Type', 'Change', 'Genic_status',
              'Gene_nb', 'Gene_name', 'Effect', 'Site_affected', 'Length',
              'Genic_type', 'Gene_nb_type', 'Gene_name_type',
              'Effect_type', 'Site_affected_type', 'Length_type']
    df_out.write(','.join(header) + '\n')
    # The type-specific column labels are keyed on Genic_status (line[4])
    # plus Effect (line[7]); keys missing from head_type fall back to the
    # plain 'Genic' template.
    head_type = {
        'Genic': ['Genic', 'Gene_nb', 'Gene_Name', 'Effect',
                  'codon_affected', 'gene_length_in_codon'],
        'Genic_Large_Deletion': ['Genic', 'Gene_nb', 'Gene_Name',
                                 'Large_Deletion', 'bp_deleted_in_Gene',
                                 'gene_length_bp'],
        'Genic_RNA': ['Genic', 'Gene_nb', 'Gene_Name', 'RNA',
                      'bp_affected', 'gene_length_bp'],
        'Intergenic_Intergenic': ['Intergenic', 'Previous_Gene_nb',
                                  'Previous_Gene_Name_distance_bp', 'Effect',
                                  'Next_Gene_Name_distance_bp',
                                  'Intergenic_type'],
        'Multigenic_Multigenic': ['Multigenic', 'First_Gene_nb',
                                  'First_Gene_Name', 'Effect',
                                  'Last_Gene_nb', 'Last_Gene_Name']}
    for i, line in enumerate(open(df_in, 'r')):
        line = line.strip().split(',')
        # skip the five header rows and any empty lines
        if (len(line) == 0) or (i < 5) or (len(line[0]) == 0):
            continue
        line_type = line[4] + '_' + line[7]
        if line_type in head_type:
            line_new = line + head_type[line_type]
        else:
            line_new = line + head_type['Genic']
        df_out.write(','.join(line_new) + '\n')
    df_out.close()
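A toy illustration (hypothetical rows, not from the Tenaillon table) of how the Genic_status + '_' + Effect key falls back to the 'Genic' template when a combination is unlisted:

# Hypothetical status/effect pairs run through the lookup logic above.
head_type_keys = ['Genic', 'Genic_Large_Deletion', 'Genic_RNA',
                  'Intergenic_Intergenic', 'Multigenic_Multigenic']
for genic_status, effect in [('Genic', 'Large_Deletion'),
                             ('Genic', 'Synonymous'),
                             ('Intergenic', 'Intergenic')]:
    key = genic_status + '_' + effect
    print(key if key in head_type_keys else 'Genic')
# -> Genic_Large_Deletion, Genic, Intergenic_Intergenic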
def get_likelihood_matrices():
    df_good_path = nt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df_good = pd.read_csv(df_good_path, sep='\t', header='infer', index_col=0)
    df_good_delta = nt.likelihood_matrix(df_good, 'Good_et_al').get_likelihood_matrix()
    df_good_delta_out = nt.get_path() + '/data/Good_et_al/gene_by_pop_delta.txt'
    df_good_delta.to_csv(df_good_delta_out, sep='\t', index=True)

    df_good_poly_path = nt.get_path() + '/data/Good_et_al/gene_by_pop_poly.txt'
    df_good_poly = pd.read_csv(df_good_poly_path, sep='\t', header='infer', index_col=0)
    df_good_poly_delta = nt.likelihood_matrix(df_good_poly, 'Good_et_al').get_likelihood_matrix()
    df_good_poly_delta_out = nt.get_path() + '/data/Good_et_al/gene_by_pop_poly_delta.txt'
    df_good_poly_delta.to_csv(df_good_poly_delta_out, sep='\t', index=True)
Example #4
def plot_t_k_node():
    network_dir = nt.get_path() + '/data/Good_et_al/networks_naive/'
    node_dict = {}
    for filename in os.listdir(network_dir):
        df = pd.read_csv(network_dir + filename,
                         sep='\t',
                         header='infer',
                         index_col=0)
        gens = filename.split('.')
        time = re.split('[_.]', filename)[1]
        node_dict[time] = {}
        for index, row in df.iterrows():
            k_row = sum(i != 0 for i in row.values) - 1
            node_dict[time][index] = k_row

    node_df = pd.DataFrame.from_dict(node_dict)
    idx = node_df.sum(axis=1).sort_values(ascending=False).head(10).index
    node_df_idx = node_df.loc[idx]  # .ix was removed from pandas; use .loc
    #node_df_idx = node_df.loc[['malT']]
    colors = ['firebrick', 'darkorange', 'gold', 'darkgreen', 'palegreen',
              'navy', 'royalblue', 'black', 'teal', 'dimgrey']
    color_count = 0

    fig = plt.figure()
    nodes = node_df.index.values
    for index, row in node_df_idx.iterrows():
        print(index)
        row = row.dropna()
        row_x = [int(x) for x in row.index.values]
        row_y = row.values
        row_xy = list(zip(row_x, row_y))
        row_xy.sort(key=lambda tup: tup[1])  # sorts in place
        row_x_sort = [x[0] for x in row_xy]
        row_y_sort = [x[1] for x in row_xy]
        plt.scatter(row_x_sort, row_y_sort, c=colors[color_count], marker='o',
                    s=120, edgecolors='k', linewidth=0.6, alpha=0.9)

        color_count += 1

    plt.xlabel('Time (generations)', fontsize=18)
    plt.ylabel('k(t)', fontsize=18)
    plt.xlim(2500, 60000)
    plt.ylim(1, 200)
    plt.xscale('log')
    plt.yscale('log')
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_t_vs_k_node.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
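The per-row degree idiom `sum(i != 0 for i in row.values) - 1` counts a node's nonzero entries and subtracts one for its own diagonal entry. A vectorized sketch of the same computation on a toy matrix (illustrative only):

import numpy as np
import pandas as pd

# Toy 3-gene matrix with a nonzero diagonal, mirroring the assumption
# behind the "- 1" above (one self-entry per row).
adj = pd.DataFrame([[1, 1, 0],
                    [1, 1, 1],
                    [0, 1, 1]],
                   index=list('abc'), columns=list('abc'))
k = (adj != 0).sum(axis=1) - 1  # degree per node, minus the self-entry
print(k.tolist())  # [1, 2, 1]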
Example #5
def plot_edge_dist():
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    k_list = []
    for index, row in df.iterrows():
        k_row = sum(i > 0 for i in row.values) - 1
        if k_row > 0:
            k_list.append(k_row)

    k_count = dict(Counter(k_list))
    total = sum(k_count.values())
    k_count = {k: v / total for k, v in k_count.items()}
    #x = np.log10(list(k_count.keys()))
    #y = np.log10(list(k_count.values()))
    k_mean = np.mean(k_list)
    print("mean k = " + str(k_mean))
    print("N = " + str(df.shape[0]))
    x = list(k_count.keys())
    y = list(k_count.values())

    x_poisson = list(range(1, 100))
    y_poisson = [(math.exp(-k_mean) * ((k_mean**k) / math.factorial(k)))
                 for k in x_poisson]

    fig = plt.figure()
    plt.scatter(x,
                y,
                marker="o",
                edgecolors='none',
                c='darkgray',
                s=120,
                zorder=3)
    plt.plot(x_poisson, y_poisson)
    plt.xlabel("Number of edges, k", fontsize=16)
    plt.ylabel("Frequency", fontsize=16)
    plt.xscale('log')
    plt.yscale('log')
    plt.ylim(0.001, 1)

    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/edge_dist.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
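The explicit Poisson pmf above evaluates e^{-λ}λ^k/k! term by term, which can overflow float arithmetic once λ^k grows large; scipy evaluates the same pmf in log space. A sketch (scipy is an assumption here; the original uses math.factorial):

import numpy as np
from scipy.stats import poisson

k_mean = 3.2                     # stand-in for the empirical mean degree
x_poisson = np.arange(1, 100)
y_poisson = poisson.pmf(x_poisson, mu=k_mean)  # numerically stable pmf
print(y_poisson[:3])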
def get_naive_good_network():
    out_directory = nt.get_path() + '/data/Good_et_al/networks_naive'
    df_path = nt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    to_exclude = nt.complete_nonmutator_lines()
    to_exclude.append('p5')
    df_nonmut = df[df.index.str.contains('|'.join( to_exclude))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    time_points = [ int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set([ int(x.split('_')[1]) for x in df_nonmut.index.values])))
    for time_point in time_points_set:
        print(time_point)
        # anchor the match: without '$', time point 500 would also match
        # rows from generation 5000, 250 would match 2500, etc.
        df_time_point = df_nonmut[df_nonmut.index.to_series().str.contains('_' + str(time_point) + '$')]
        df_time_point = df_time_point.loc[:, (df_time_point != 0).any(axis=0)]
        network_tp = nt.reconstruct_naive_network(df_time_point)
        network_tp.to_csv(out_directory + '/network_' + str(time_point) + '.txt', sep = '\t', index = True)
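The '$' anchor added above matters because pandas' str.contains does substring/regex matching, so an unanchored '_500' also matches a '_5000' index. A hypothetical demonstration:

import pandas as pd

idx = pd.Series(['m1_500', 'm1_5000'])
print(idx.str.contains('_500').tolist())   # [True, True]  (collision)
print(idx.str.contains('_500$').tolist())  # [True, False] (anchored)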
def run_network_permutation_rndm(iterations = 1000, include_kmax = True):
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df = df.drop('kpsD', axis=0)
    df = df.drop('kpsD', axis=1)
    df_out = open(nt.get_path() + '/data/Tenaillon_et_al/permute_network_rndm_nokmax.txt', 'w')
    df_out.write('\t'.join(['Iteration', 'k_max', 'k_mean', 'C_mean', 'C_mean_no1or2', 'd_mean']) + '\n')

    for i in range(iterations):
        print("Iteration " + str(i))
        df_rndm = nt.get_random_network_edges(df)
        k_i_rndm = []
        C_i_rndm_list = []

        for index, row in df_rndm.iterrows():
            k_row = int((row.values != 0).sum()) - 1
            k_i_rndm.append(k_row)
            if (k_row == 0) or (k_row == 1):
                C_i_rndm_list.append(float(0))
            else:
                # boolean indexing replaces the removed Series.nonzero()
                row_non_zero = row[row != 0]
                # drop the node itself
                row_non_zero = row_non_zero.drop(labels=[index])
                L_i = 0
                for index_gene, gene in row_non_zero.items():
                    row_non_zero_list = row_non_zero.index.tolist()
                    row_non_zero_list.remove(index_gene)
                    # look edges up in the randomized matrix (df_rndm), not
                    # the original df, when scoring the randomized network
                    df_subset = df_rndm.loc[[index_gene]][row_non_zero_list]
                    L_i += int((df_subset.values != 0).sum())
                # no factor of 2 on L_i: looping over every neighbor
                # already double counts each edge among the neighbors
                C_i = L_i / (k_row * (k_row - 1))
                C_i_rndm_list.append(C_i)

        k_max = max(k_i_rndm)
        k_mean = np.mean(k_i_rndm)
        C_mean = np.mean(C_i_rndm_list)
        C_mean_no1or2 = np.mean([l for l in C_i_rndm_list if l > 0])
        distance_df = nt.networkx_distance(df_rndm)

        df_out.write('\t'.join([str(i), str(k_max), str(k_mean), str(C_mean), str(C_mean_no1or2), str(distance_df)]) + '\n')

    df_out.close()
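nt.get_random_network_edges is the repository's own randomization; a common off-the-shelf alternative null model is degree-preserving rewiring, sketched here with networkx (an assumption, not the method used above):

import networkx as nx

# Rewire edges while keeping every node's degree fixed, then compare a
# statistic of interest between the observed and rewired graphs.
G = nx.barabasi_albert_graph(100, 2, seed=1)
G_null = G.copy()
nx.double_edge_swap(G_null, nswap=10 * G.number_of_edges(), max_tries=10**6)
print(nx.average_clustering(G), nx.average_clustering(G_null))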
Example #8
def plot_cluster_dist():
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    k_count = dict(Counter(df.C_i.values))
    total = sum(k_count.values())
    k_count = {k: v / total for k, v in k_count.items()}
    #x = np.log10(list(k_count.keys()))
    #y = np.log10(list(k_count.values()))
    # cluster kde
    C_i = df.C_i.values
    grid_ = GridSearchCV(KernelDensity(),
                         {'bandwidth': np.linspace(0.1, 10, 50)},
                         cv=20)  # 20-fold cross-validation
    grid_.fit(C_i[:, None])
    x_grid_ = np.linspace(0, 2.5, 1000)
    kde_ = grid_.best_estimator_
    pdf_ = np.exp(kde_.score_samples(x_grid_[:, None]))
    pdf_ = [x / sum(pdf_) for x in pdf_]

    x = list(k_count.keys())
    y = list(k_count.values())

    #x_poisson = list(range(1, 100))
    #y_poisson = [(math.exp(-k_mean) * ( (k_mean ** k)  /  math.factorial(k) )) for k in x_poisson]

    fig = plt.figure()
    #plt.scatter(x, y, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3)
    plt.plot(x_grid_, pdf_)
    plt.ylabel("Clustering coefficient, " + r'$C_{i}$', fontsize=16)
    plt.xlabel("Number of edges, " + r'$k$', fontsize=16)
    #plt.xscale('log')
    #plt.yscale('log')
    #plt.ylim(0, 1)
    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/C_dist.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
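The grid search above tunes the KDE bandwidth by 20-fold cross-validation; scipy's gaussian_kde with its default rule-of-thumb bandwidth is a lighter-weight sketch of the same density estimate (scipy is an assumption, and its result will differ from the cross-validated fit):

import numpy as np
from scipy.stats import gaussian_kde

C_i = np.random.beta(2, 5, size=200)  # stand-in clustering coefficients
kde = gaussian_kde(C_i)               # bandwidth via Scott's rule
x_grid = np.linspace(0, 2.5, 1000)
pdf = kde(x_grid)
pdf = pdf / pdf.sum()                 # normalize as the script above does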
def run_network_permutation_ba():
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_out = open(nt.get_path() + '/data/Tenaillon_et_al/permute_network_ba.txt', 'w')
    df_out.write('\t'.join(['Iteration', 'k_max', 'k_mean', 'C_mean', 'C_mean_no1or2', 'd_mean']) + '\n')

    k_list = []
    for index, row in df.iterrows():
        k_row = sum(i != 0 for i in row.values) - 1
        if k_row > 0:
            k_list.append(k_row)
    count_k_list = Counter(k_list)
    count_k_list_sum = sum(count_k_list.values())
    x = count_k_list.keys()
    y = [(i / count_k_list_sum) for i in count_k_list.values()]
    # drop the hub: remove the maximum-degree bin before refitting
    count_k_list.pop(max(x), None)
    x_no_max = list(count_k_list.keys())
    # the division assumes the dropped bin held a single node, hence sum - 1
    y_no_max = [(i / (count_k_list_sum - 1)) for i in count_k_list.values()]


    model_no_max = nt.continuumBarabasiAlbert(x_no_max, y_no_max)
    m_start = [1, 2, 3, 4]
    z_start = [-2,-0.5]
    results = []
    for m in m_start:
        for z in z_start:
            start_params = [m, z]
            result = model_no_max.fit(start_params = start_params)
            results.append(result)
    AICs = [result.aic for result in results]
    best = results[AICs.index(min(AICs))]
    best_CI_FIC = nt.CI_FIC(best)
    best_CI = best.conf_int()
    best_params = best.params
    print(best_params)

    #barabasi_albert_range_C_ll = nt.cluster_BA(np.sort(x_C), best_params[0])



    df_out.close()
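The fitting loop above restarts the optimizer from a small grid of start values and keeps the result with the lowest AIC, where AIC = 2k − 2 ln L̂. A generic sketch of that select-by-AIC pattern with hypothetical numbers:

# Hypothetical stand-in for the fit-and-select loop above: each candidate
# is (number of parameters k, maximized log-likelihood ln L_hat).
def aic(n_params, log_likelihood):
    return 2 * n_params - 2 * log_likelihood

candidates = [(2, -120.4), (2, -118.9), (2, -119.7)]
aics = [aic(k, ll) for k, ll in candidates]
print(candidates[aics.index(min(aics))])  # the lowest-AIC candidate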
def get_good_network_features(reconstruct = 'naive'):
    if reconstruct == 'naive':
        directory = nt.get_path() + '/data/Good_et_al/networks_naive/'
        df_out = open(nt.get_path() + '/data/Good_et_al/network_naive_features.txt', 'w')
        df_clust_path = nt.get_path() + '/data/Good_et_al/network_naive_CCs.txt'
    elif reconstruct == 'BIC':
        directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
        df_out = open(nt.get_path() + '/data/Good_et_al/network_BIC_features.txt', 'w')
        df_clust_path = nt.get_path() + '/data/Good_et_al/network_BIC_CCs.txt'
    else:
        raise ValueError("Argument reconstruct not recognized")

    df_out_columns = ['Generations', 'N', 'k_max', 'k_mean', 'C_mean', 'C_mean_no1or2', 'd_mean']
    df_out.write('\t'.join(df_out_columns) + '\n')
    df_clust = pd.read_csv(df_clust_path, sep = '\t', header = 'infer')#, index_col = 0)
    for filename in os.listdir(directory):
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(directory + filename, sep = '\t', header = 'infer', index_col = 0)
        gens = filename.split('.')
        time = re.split('[_.]', filename)[1]
        df_clust_time = df_clust.loc[df_clust['Generations'] == int(time)]
        N = df.shape[0]
        k_max = max(df_clust_time.k_i.values)
        k_mean = np.mean(df_clust_time.k_i.values)
        C_mean = np.mean(df_clust_time.C_i.values)
        C_mean_no1or2 = np.mean(df_clust_time.loc[df_clust_time['k_i'] >= 2].C_i.values)

        distance_df = nt.networkx_distance(df)
        print(time)
        print(distance_df)

        row = [str(time), str(N), str(k_max), str(k_mean), str(C_mean), str(C_mean_no1or2), str(distance_df)]
        df_out.write('\t'.join(row) + '\n')

    df_out.close()
Example #11
def plot_nodes_over_time():
    directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
    time_nodes = []
    for filename in os.listdir(directory):
        df = pd.read_csv(directory + filename,
                         sep='\t',
                         header='infer',
                         index_col=0)
        gens = filename.split('.')
        time = re.split('[_.]', filename)[1]
        time_nodes.append((int(time), df.shape[0]))
    time_nodes_sorted = sorted(time_nodes, key=lambda tup: tup[0])
    x = [i[0] for i in time_nodes_sorted]
    y = [i[1] for i in time_nodes_sorted]

    x_pred = list(set(x))
    x_pred.sort()
    y_pred = [min(y) + i + 1 for i in range(len(x_pred))]

    fig = plt.figure()
    plt.scatter(x,
                y,
                marker="o",
                edgecolors='#244162',
                c='#175ac6',
                s=120,
                zorder=3)
    plt.plot(x_pred, y_pred)
    plt.xlabel("Time (generations)", fontsize=18)
    plt.ylabel('Network size, ' + r'$N$', fontsize=18)
    plt.ylim(5, 500)
    plt.yscale('log')

    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_N_vs_time.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
def pop_by_gene_tenaillon(self):
    pop_by_gene_dict = {}
    gene_size_dict = {}
    df_in = nt.get_path() + 'data/Tenaillon_et_al/1212986tableS2_clean.csv'
    for i, line in enumerate(open(df_in, 'r')):
        line_split = line.strip().split(',')
        if (line_split[4] == 'Intergenic') or (i == 0) or \
                (not line_split[9].isdigit()):
            continue
        gene_length_units = line_split[-1]
        gene_name = line_split[6]
        pop_name = line_split[0]
        if gene_length_units == 'gene_length_in_codon':
            gene_length = int(line_split[9]) * 3
        elif gene_length_units == 'gene_length_bp':
            gene_length = int(line_split[9])
        else:
            # unknown length units; skip rather than reuse a stale value
            continue
        if gene_name not in gene_size_dict:
            gene_size_dict[gene_name] = gene_length

        if gene_name not in pop_by_gene_dict:
            pop_by_gene_dict[gene_name] = {}

        if pop_name not in pop_by_gene_dict[gene_name]:
            pop_by_gene_dict[gene_name][pop_name] = 1
        else:
            pop_by_gene_dict[gene_name][pop_name] += 1

    df = pd.DataFrame.from_dict(pop_by_gene_dict)
    df = df.fillna(0)
    # remove rows and columns with all zeros
    #df = df.loc[(df.sum(axis=1) != 0), (df.sum(axis=0) != 0)]
    df_out = nt.get_path() + 'data/Tenaillon_et_al/gene_by_pop.txt'
    df.to_csv(df_out, sep='\t', index=True)
    gene_size_dict_out = nt.get_path() + 'data/Tenaillon_et_al/gene_size_dict.txt'
    with open(gene_size_dict_out, 'wb') as handle:
        pickle.dump(gene_size_dict, handle)
def reformat_convergence_matrix(self, mut_type='F'):
    conv_dict = self.parse_convergence_matrix(
        nt.get_path() + "data/Good_et_al/gene_convergence_matrix.txt")
    time_points = []
    new_dict = {}
    for gene_name, gene_data in conv_dict.items():
        for pop_name, mutations in gene_data['mutations'].items():
            for mutation in mutations:
                time = int(mutation[0])
                time_points.append(time)
    time_points = sorted(list(set(time_points)))
    for gene_name, gene_data in conv_dict.items():
        if gene_name not in new_dict:
            new_dict[gene_name] = {}
        for pop_name, mutations in gene_data['mutations'].items():
            if len(mutations) == 0:
                continue

            mutations.sort(key=lambda tup: tup[0])
            # mutation states are coded {'A': 0, 'E': 1, 'F': 2, 'P': 3}
            if mut_type == 'F':
                # keep only fixed mutations
                mutations = [x for x in mutations if int(x[1]) == 2]
            elif mut_type == 'P':
                mutations = [x for x in mutations
                             if (int(x[1]) == 3)]  #or (int(x[1]) == 0)]
            else:
                raise ValueError("Argument mut_type not recognized")

            if len(mutations) == 0:
                continue
            for mutation in mutations:
                if mut_type == 'F':
                    # a fixation at time t counts toward t and every
                    # later sampled time point
                    time = mutation[0]
                    remaining_time_points = time_points[time_points.index(time):]
                    for time_point in remaining_time_points:
                        pop_time = pop_name + '_' + str(int(time_point))
                        if pop_time not in new_dict[gene_name]:
                            new_dict[gene_name][pop_time] = 1
                        else:
                            new_dict[gene_name][pop_time] += 1
                elif mut_type == 'P':
                    pop_time = pop_name + '_' + str(int(mutation[0]))
                    if pop_time not in new_dict[gene_name]:
                        new_dict[gene_name][pop_time] = 1
                    else:
                        new_dict[gene_name][pop_time] += 1

    df = pd.DataFrame.from_dict(new_dict)
    df = df.fillna(0)
    df = df.loc[:, (df != 0).any(axis=0)]
    if mut_type == 'F':
        df_out = nt.get_path() + 'data/Good_et_al/gene_by_pop.txt'
        #df_delta_out = mydir + 'data/Good_et_al/gene_by_pop_delta.txt'
    elif mut_type == 'P':
        df_out = nt.get_path() + 'data/Good_et_al/gene_by_pop_poly.txt'
        #df_delta_out = mydir + 'data/Good_et_al/gene_by_pop_poly_delta.txt'
    df.to_csv(df_out, sep='\t', index=True)
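A toy illustration of the fixed-mutation bookkeeping above: a mutation fixed at generation 1000 adds a count at 1000 and at every later sampled time point (hypothetical time points, not the LTEE sampling grid):

time_points = [500, 1000, 1500, 2000]  # hypothetical sampled generations
time = 1000                            # generation at which the mutation fixed
counts = {}
for time_point in time_points[time_points.index(time):]:
    pop_time = 'm1_' + str(time_point)
    counts[pop_time] = counts.get(pop_time, 0) + 1
print(counts)  # {'m1_1000': 1, 'm1_1500': 1, 'm1_2000': 1}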
Example #14
def plot_kmax_over_time():
    directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
    time_kmax = []
    for filename in os.listdir(directory):
        df = pd.read_csv(directory + filename,
                         sep='\t',
                         header='infer',
                         index_col=0)
        gens = filename.split('.')
        time = re.split('[_.]', filename)[1]
        time_kmax.append((int(time), max(df.astype(bool).sum(axis=0).values)))

    time_kmax_sorted = sorted(time_kmax, key=lambda tup: tup[0])
    x = [i[0] for i in time_kmax_sorted]
    y = [i[1] for i in time_kmax_sorted]
    x = np.log10(x)
    y = np.log10(y)

    #df_rndm_path = nt.get_path() + '/data/Good_et_al/networks_BIC_rndm.txt'
    #df_rndm = pd.read_csv(df_rndm_path, sep = '\t', header = 'infer')

    #x_rndm = np.log10(df_rndm.Generations.values)
    #y_rndm = np.log10(df_rndm.Generations.values)

    fig = plt.figure()
    #plt.scatter(x, y, marker = "o", edgecolors='none', c = '#175ac6', s = 120, zorder=3)
    plt.scatter(x, y, c='#175ac6', marker='o', s=120,
                edgecolors='#244162', linewidth=0.6, alpha=0.8, zorder=3)
    #plt.scatter(x_rndm, y_rndm, marker = "o", edgecolors='none', c = 'blue', s = 120, alpha = 0.1)
    # The regression code below is adapted from Ken Locey's
    # ScalingMicroBiodiversity repository (GPLv3):
    # https://github.com/klocey/ScalingMicroBiodiversity

    df = pd.DataFrame({'t': list(x)})
    df['kmax'] = list(y)
    f = smf.ols('kmax ~ t', df).fit()

    R2 = f.rsquared
    pval = f.pvalues
    intercept = f.params[0]
    slope = f.params[1]
    X = np.linspace(min(x), max(x), 1000)
    Y = f.predict(exog=dict(t=X))

    st, data, ss2 = summary_table(f, alpha=0.05)
    print(ss2)
    fittedvalues = data[:, 2]
    pred_mean_se = data[:, 3]
    pred_mean_ci_low, pred_mean_ci_upp = data[:, 4:6].T
    pred_ci_low, pred_ci_upp = data[:, 6:8].T

    slope_to_gamma = (1 / slope) + 1

    plt.fill_between(x,
                     pred_ci_low,
                     pred_ci_upp,
                     color='#175ac6',
                     lw=0.5,
                     alpha=0.2)
    #'$^\frac{1}{1 - '+str(round(slope_to_gamme,2))+'}$'
    #plt.text(2.4, 2.1, r'$k_{max}$'+ ' = '+str(round(10**intercept,2))+'*'+r'$t$'+ '$^{\frac{1}{1 - '+str(round(slope_to_gamme,2))+'}}$', fontsize=10, color='k', alpha=0.9)
    plt.text(2.4,
             2.05,
             r'$k_{max}$' + ' = ' + str(round(10**intercept, 2)) + '*' +
             r'$t^ \frac{1}{\,' + str(round(slope_to_gamma, 2)) + '- 1}$',
             fontsize=12,
             color='k',
             alpha=0.9)
    plt.text(2.4,
             1.94,
             r'$r^2$' + ' = ' + str("%.2f" % R2),
             fontsize=12,
             color='0.2')
    plt.plot(X.tolist(),
             Y.tolist(),
             '--',
             c='k',  # passing both c= and color= raises in matplotlib
             lw=2,
             alpha=0.8,
             label='Power-law')

    #plt.plot(t_x, t_y)
    plt.xlabel("Time (generations), " + r'$\mathrm{log}_{10}$', fontsize=18)
    plt.ylabel(r'$k_{max}, \;  \mathrm{log}_{10}$', fontsize=18)
    #plt.xscale('log')
    #plt.yscale('log')
    #plt.ylim(0.001, 1)

    fig.tight_layout()
    fig.savefig(nt.get_path() + '/figs/good_kmax_vs_time.png',
                bbox_inches="tight",
                pad_inches=0.4,
                dpi=600)
    plt.close()
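The γ above follows from the preferential-attachment prediction k_max ∝ t^{1/(γ−1)}: a log–log slope b implies γ = 1/b + 1. A one-line check with a hypothetical slope:

slope = 0.5               # hypothetical fitted log-log slope
gamma = (1 / slope) + 1   # k_max ~ t^(1/(gamma - 1)) -> gamma = 3.0
print(gamma)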
def get_network_clustering_coefficients(dataset = 'good', kmax = True, reconstruct = 'naive'):
    if dataset == 'tenaillon':
        # df is a numpy matrix or pandas dataframe containing network interactions
        df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
        df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
        if kmax == False:
            df = df.drop('kpsD', axis=0)
            df = df.drop('kpsD', axis=1)
            df_out = open(nt.get_path() + '/data/Tenaillon_et_al/network_CCs_no_kmax.txt', 'w')
        else:
            df_out = open(nt.get_path() + '/data/Tenaillon_et_al/network_CCs.txt', 'w')
        df_out.write('\t'.join(['Gene', 'k_i', 'C_i']) + '\n')
        for index, row in df.iterrows():
            k_row = sum(i != 0 for i in row.values) - 1
            if (k_row == 0) or (k_row == 1):
                C_i = 0
            else:
                # boolean indexing replaces the removed Series.nonzero()
                row_non_zero = row[row != 0]
                # drop the node itself
                row_non_zero = row_non_zero.drop(labels=[index])
                L_i = 0
                for index_gene, gene in row_non_zero.items():
                    row_non_zero_list = row_non_zero.index.tolist()
                    row_non_zero_list.remove(index_gene)
                    df_subset = df.loc[[index_gene]][row_non_zero_list]
                    L_i += int((df_subset.values != 0).sum())
                # no factor of 2 on L_i: looping over every neighbor
                # already double counts each edge among the neighbors
                C_i = L_i / (k_row * (k_row - 1))
            df_out.write('\t'.join([index, str(k_row), str(C_i)]) + '\n')
        df_out.close()

    elif dataset == 'good':
        if reconstruct == 'naive':
            directory = nt.get_path() + '/data/Good_et_al/networks_naive/'
            df_out = open(nt.get_path() + '/data/Good_et_al/network_naive_CCs.txt', 'w')
        elif reconstruct == 'BIC':
            directory = nt.get_path() + '/data/Good_et_al/networks_BIC/'
            df_out = open(nt.get_path() + '/data/Good_et_al/network_BIC_CCs.txt', 'w')
        else:
            raise ValueError("Argument reconstruct not recognized")

        df_out.write('\t'.join(['Generations', 'Gene', 'k_i', 'C_i']) + '\n')
        for filename in os.listdir(directory):
            if filename == '.DS_Store':
                continue
            df = pd.read_csv(directory + filename, sep = '\t', header = 'infer', index_col = 0)
            gens = filename.split('.')
            time = re.split('[_.]', filename)[1]
            print(time)
            for index, row in df.iterrows():
                k_row = sum(i != 0 for i in row.values) - 1
                if (k_row == 0) or (k_row == 1):
                    C_i = float(0)
                else:
                    # boolean indexing replaces the removed Series.nonzero()
                    row_non_zero = row[row != 0]
                    # drop the node itself
                    row_non_zero = row_non_zero.drop(labels=[index])
                    L_i = 0
                    for index_gene, gene in row_non_zero.items():
                        row_non_zero_list = row_non_zero.index.tolist()
                        row_non_zero_list.remove(index_gene)
                        df_subset = df.loc[[index_gene]][row_non_zero_list]
                        L_i += int((df_subset.values != 0).sum())
                    # no factor of 2 on L_i: looping over every neighbor
                    # already double counts each edge among the neighbors
                    C_i = L_i / (k_row * (k_row - 1))
                df_out.write('\t'.join([str(time), index, str(k_row), str(C_i)]) + '\n')

        df_out.close()
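The hand-rolled C_i above is the standard local clustering coefficient, computed on a matrix whose diagonal marks the node itself. A sketch cross-checking it against networkx on a toy matrix (networkx is an assumption; the repository computes C_i manually):

import networkx as nx
import numpy as np

# Toy symmetric interaction matrix with a nonzero diagonal, as above.
A = np.array([[1, 1, 1, 0],
              [1, 1, 1, 0],
              [1, 1, 1, 1],
              [0, 0, 1, 1]])
G = nx.from_numpy_array(A - np.eye(4, dtype=int))  # strip the diagonal
print(nx.clustering(G))  # per-node C_i for comparison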
Example #16
def fig6():
    network_dir = nt.get_path() + '/data/Good_et_al/networks_naive/'
    #network_dir = nt.get_path() + '/data/Good_et_al/networks_BIC/'
    time_nodes = []
    time_kmax = []
    for filename in os.listdir(network_dir):
        if filename == '.DS_Store':
            continue
        df = pd.read_csv(network_dir + filename,
                         sep='\t',
                         header='infer',
                         index_col=0)
        gens = filename.split('.')
        time = re.split('[_.]', filename)[1]
        time_nodes.append((int(time), df.shape[0]))
        time_kmax.append((int(time), max(df.astype(bool).sum(axis=0).values)))

    fig = plt.figure()

    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    time_nodes_sorted = sorted(time_nodes, key=lambda tup: tup[0])
    x_nodes = [i[0] for i in time_nodes_sorted]
    y_nodes = [i[1] for i in time_nodes_sorted]
    ax1.scatter(x_nodes, y_nodes, marker="o", edgecolors='#244162',
                c='#175ac6', s=80, zorder=3, alpha=0.6)
    ax1.set_xlabel("Time (generations)", fontsize=14)
    ax1.set_ylabel('Network size, ' + r'$N$', fontsize=14)
    ax1.set_ylim(5, 500)
    #ax1.set_yscale('log')

    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    time_kmax_sorted = sorted(time_kmax, key=lambda tup: tup[0])
    x_kmax = [i[0] for i in time_kmax_sorted]
    y_kmax = [i[1] for i in time_kmax_sorted]
    x_kmax = np.log10(x_kmax)
    y_kmax = np.log10(y_kmax)
    '''The below regression code is from the GitHub repository
    ScalingMicroBiodiversity and is licensed under a
    GNU General Public License v3.0.

    https://github.com/klocey/ScalingMicroBiodiversity
    '''
    df_regression = pd.DataFrame({'t': list(x_kmax)})
    df_regression['kmax'] = list(y_kmax)
    f = smf.ols('kmax ~ t', df_regression).fit()

    R2 = f.rsquared
    pval = f.pvalues
    intercept = f.params[0]
    slope = f.params[1]
    X = np.linspace(min(x_kmax), max(x_kmax), 1000)
    Y = f.predict(exog=dict(t=X))
    print(min(x_kmax), max(y_kmax))

    st, data, ss2 = summary_table(f, alpha=0.05)
    fittedvalues = data[:, 2]
    pred_mean_se = data[:, 3]
    pred_mean_ci_low, pred_mean_ci_upp = data[:, 4:6].T
    pred_ci_low, pred_ci_upp = data[:, 6:8].T

    slope_to_gamma = (1 / slope) + 1

    ax2.scatter([10**i for i in x_kmax], [10**i for i in y_kmax],
                c='#175ac6', marker='o', s=80, edgecolors='#244162',
                linewidth=0.6, alpha=0.6, zorder=1)
    ax2.fill_between([10**i for i in x_kmax], [10**i for i in pred_ci_low],
                     [10**i for i in pred_ci_upp],
                     color='#175ac6',
                     lw=0.5,
                     alpha=0.2,
                     zorder=2)
    ax2.text(250,
             100,
             r'$k_{max}$' + ' = ' + str(round(10**intercept, 2)) + '*' +
             r'$t^ \frac{1}{\,' + str(round(slope_to_gamma, 2)) + '- 1}$',
             fontsize=9,
             color='k',
             alpha=0.9)
    ax2.text(250,
             60,
             r'$r^2$' + ' = ' + str("%.2f" % R2),
             fontsize=9,
             color='0.2')
    ax2.plot([10**i for i in X.tolist()], [10**i for i in Y.tolist()],
             '--',
             c='k',  # passing both c= and color= raises in matplotlib
             lw=2,
             alpha=0.8,
             label='Power-law',
             zorder=2)
    ax2.set_xlabel("Time (generations)", fontsize=14)
    ax2.set_ylabel(r'$k_{max}$', fontsize=14)
    ax2.set_xscale('log')
    ax2.set_yscale('log')

    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    df_net_feats_path = nt.get_path() + '/data/Good_et_al/network_naive_features.txt'
    df_net_feats = pd.read_csv(df_net_feats_path, sep='\t', header='infer')
    x_C = df_net_feats.N.values
    y_C = df_net_feats.C_mean.values

    ax3.scatter(x_C, y_C, c='#175ac6', marker='o', s=80,
                edgecolors='#244162', linewidth=0.6, alpha=0.6, zorder=1)
    x_C_range = list(range(10, max(x_C)))
    barabasi_albert_range_C = [((np.log(i)**2) / i) for i in x_C_range]
    random_range_c = [(1 / i) for i in x_C_range]

    x_C_sort = list(set(x_C.tolist()))
    x_C_sort.sort()
    model = nt.clusterBarabasiAlbert(x_C, y_C)
    b0_start = [0.01, 0.1, 1, 10]
    z_start = [-2, -0.5]
    results = []
    for b0 in b0_start:
        for z in z_start:
            start_params = [b0, z]
            result = model.fit(start_params=start_params)
            results.append(result)
    AICs = [result.aic for result in results]
    best = results[AICs.index(min(AICs))]
    best_CI_FIC = nt.CI_FIC(best)
    best_CI = best.conf_int()
    best_params = best.params

    barabasi_albert_range_C_ll = nt.cluster_BA(np.sort(x_C), best_params[0])

    ax3.plot(np.sort(x_C),
             barabasi_albert_range_C_ll,
             c='k',
             lw=2.5,
             ls='--',
             zorder=2)
    #plt.plot(x_C_range, random_range_c, c = 'r', lw = 2.5, ls = '--')
    ax3.set_xlabel('Network size, ' + r'$N$', fontsize=14)
    ax3.set_ylabel('Mean clustering \ncoefficient, ' +
                   r'$\left \langle C \right \rangle$',
                   fontsize=14)
    #ax3.set_xscale('log')
    ax3.set_yscale('log')
    ax3.set_ylim(0.05, 1.5)

    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    x_d = df_net_feats.N.values
    y_d = df_net_feats.d_mean.values
    ax4.scatter(x_d, y_d, c='#175ac6', marker='o', s=80,
                edgecolors='#244162', linewidth=0.6, alpha=0.6, zorder=1)
    #x_d_range = list(range(10, max(x_d)))
    #barabasi_albert_range_d = [ (np.log(i) / np.log(np.log(i))) for i in x_d_range ]
    x_d_sort = list(set(x_d.tolist()))
    x_d_sort.sort()
    model_d = nt.distanceBarabasiAlbert(x_d, y_d)
    results_d = []
    for b0 in b0_start:
        for z in z_start:
            start_params_d = [b0, z]
            result_d = model_d.fit(start_params=start_params_d)
            results_d.append(result_d)
    AICs_d = [result_d.aic for result_d in results_d]
    best_d = results_d[AICs_d.index(min(AICs_d))]
    best_CI_FIC_d = nt.CI_FIC(best_d)
    best_CI_d = best_d.conf_int()
    best_d_params = best_d.params

    barabasi_albert_range_d_ll = nt.distance_BA(np.sort(x_d), best_d_params[0])
    # use x_d, this panel's own x variable (x_C happens to hold the same N values)
    ax4.plot(np.sort(x_d),
             barabasi_albert_range_d_ll,
             c='k',
             lw=2.5,
             ls='--',
             zorder=2)
    #random_range = [ np.log(i) for i in x_d_range ]
    #ax4.plot(x_d_range, random_range, c = 'r', lw = 2.5, ls = '--')
    ax4.set_xlabel('Network size, ' + r'$N$', fontsize=14)
    ax4.set_ylabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
                   fontsize=14)
    #ax4.set_xscale('log')

    plt.tight_layout()
    fig_name = nt.get_path() + '/figs/fig6.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
Example #17
def plot_end_network():
    network_path = nt.get_path() + '/data/Good_et_al/networks_naive/network_62750.txt'
    df = pd.read_csv(network_path, sep='\t', header='infer', index_col=0)
    print(df)
import networkx as nx
import pandas as pd
import network_tools as nt
#import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

df = pd.read_csv(nt.get_path() + '/data/Good_et_al/networks_naive/network_55250.txt', index_col=0, sep = '\t')
df_values = df.values
G = nx.from_numpy_array(df_values)  # from_numpy_matrix was removed in networkx 3.x
#print(G)
#nx.draw(G)
#plt.savefig(nt.get_path() + '/figs/ntwrk_good.png', format="PNG")
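The commented draw calls above leave the nodes integer-labeled; a sketch that restores the gene names from the dataframe index and saves the drawing (layout and styling here are assumptions, not the repository's choices):

# Relabel integer nodes with gene names, lay out, draw, and save.
G = nx.relabel_nodes(G, dict(enumerate(df.index)))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, node_size=30, with_labels=False)
plt.savefig(nt.get_path() + '/figs/ntwrk_good.png', format="PNG")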
Example #19
def fig4():
    df_path = nt.get_path() + '/data/Tenaillon_et_al/network.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)

    #df_C_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs.txt'
    df_C_path = nt.get_path() + '/data/Tenaillon_et_al/network_CCs_no_kmax.txt'
    df_C = pd.read_csv(df_C_path, sep='\t', header='infer', index_col=0)
    kmax_df = max(df_C.k_i.values)
    mean_C_df = np.mean(df_C.loc[df_C['k_i'] >= 2].C_i.values)
    df_null_path = nt.get_path() + '/data/Tenaillon_et_al/permute_network.txt'
    df_null = pd.read_csv(df_null_path, sep='\t', header='infer', index_col=0)

    df_no_max = df.copy()
    df_no_max = df_no_max.drop('kpsD', axis=0)
    df_no_max = df_no_max.drop('kpsD', axis=1)
    #dist_df = nt.networkx_distance(df)
    dist_df = nt.networkx_distance(df_no_max)

    C_mean_null = df_null.C_mean_no1or2.tolist()
    C_mean_null = [x for x in C_mean_null if str(x) != 'nan']
    d_mean_null = df_null.d_mean.tolist()
    k_max_null = df_null.k_max.tolist()

    fig = plt.figure()
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    k_list = []
    for index, row in df.iterrows():
        k_row = sum(i != 0 for i in row.values) - 1
        if k_row > 0:
            k_list.append(k_row)

    count_k_list = Counter(k_list)
    count_k_list_sum = sum(count_k_list.values())
    count_k_list_x = list(count_k_list.keys())
    count_k_list_y = [(i / count_k_list_sum) for i in count_k_list.values()]

    k_list_no_max = []
    for index, row in df_no_max.iterrows():
        k_row = sum(i != 0 for i in row.values) - 1
        if k_row > 0:
            k_list_no_max.append(k_row)

    count_k_list_no_max = Counter(k_list_no_max)
    count_k_list_sum_no_max = sum(count_k_list_no_max.values())
    count_k_list_x_no_max = list(count_k_list_no_max.keys())
    count_k_list_y_no_max = [(i / count_k_list_sum_no_max)
                             for i in count_k_list_no_max.values()]

    ax1.scatter(count_k_list_x,
                count_k_list_y,
                marker="o",
                edgecolors='#244162',
                c='#175ac6',
                alpha=0.4,
                s=60,
                zorder=4)
    # red colors
    # edge #C92525
    # c #FF4343
    #ax1.scatter(count_k_list_x_no_max, count_k_list_y_no_max, marker = "o", edgecolors='#C92525', c = '#FF4343', alpha = 0.4, s = 60, zorder=4)

    count_k_list_x.sort()
    #m = 0.56086623
    #pred_y = [ ((2 * m * (m+1)) / (j * (j+1) * (j+2) )) for j in count_k_list_x ]
    #ax1.plot(count_k_list_x, pred_y, c = 'k', lw = 2.5,
    #    ls = '--', zorder=2)
    p = sum(k_list) / (((df.shape[0]) * (df.shape[0] - 1)) / 2)
    # exclude the dropped hub's degree (181) when computing the connection
    # probability for the network without its k_max node
    p_no_max = sum([i for i in k_list if i != 181]) / (((df.shape[0] - 1) *
                                                        (df.shape[0] - 2)) / 2)

    binom_x = np.arange(0, max(k_list))
    binom_y = binom.pmf(binom_x, df.shape[0] - 1, p)
    binom_y_noMax = binom.pmf(binom_x, df.shape[0] - 2, p_no_max)

    ax1.plot(binom_x, binom_y_noMax, c='k', lw=2.5, ls='--', zorder=2)
    ax1.set_xlim([0.5, 400])
    ax1.set_ylim([0.001, 1])
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel(r'$k_{i}$', fontsize=14)
    ax1.set_ylabel("Frequency", fontsize=14)

    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    ax2.hist(k_max_null,
             bins=30,
             weights=np.zeros_like(k_max_null) + 1. / len(k_max_null),
             alpha=0.8,
             color='#175ac6')
    #ax2.axvline(max(k_list_no_max), color = 'red', lw = 2, ls = ':')
    ax2.axvline(max(k_list_no_max), color='red', lw=2, ls='--')
    #ax2.axvline(kmax_df, color = 'red', lw = 2, ls = '--')
    #ax2.set_xscale('log')
    ax2.set_xlabel(r'$k_{max}$', fontsize=14)
    ax2.set_ylabel("Frequency", fontsize=14)

    k_max_null.append(kmax_df)
    relative_position_k_max = sorted(k_max_null).index(kmax_df) / (
        len(k_max_null) - 1)
    if relative_position_k_max > 0.5:
        p_score_k_max = 1 - relative_position_k_max
    else:
        p_score_k_max = relative_position_k_max
    print('kmax p-score = ' + str(round(p_score_k_max, 3)))
    #ax2.text(0.366, 0.088, r'$p < 0.05$', fontsize = 10)

    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    #print(C_mean_null)
    ax3.hist(C_mean_null,
             bins=30,
             weights=np.zeros_like(C_mean_null) + 1. / len(C_mean_null),
             alpha=0.8,
             color='#175ac6')
    ax3.axvline(mean_C_df, color='red', lw=2, ls='--')
    #ax3.set_xlabel("Mean clustering coefficient", fontsize = 14)
    ax3.set_xlabel('Mean clustering coefficient, ' +
                   r'$\left \langle C \right \rangle$',
                   fontsize=14)
    ax3.set_ylabel("Frequency", fontsize=14)

    C_mean_null.append(mean_C_df)
    relative_position_mean_C = sorted(C_mean_null).index(mean_C_df) / (
        len(C_mean_null) - 1)
    if relative_position_mean_C > 0.5:
        p_score_mean_C = 1 - relative_position_mean_C
    else:
        p_score_mean_C = relative_position_mean_C
    print('mean C p-score = ' + str(round(p_score_mean_C, 3)))
    #ax3.text(0.078, 0.115, r'$p < 0.05$', fontsize = 10)

    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    ax4.hist(d_mean_null,
             bins=30,
             weights=np.zeros_like(d_mean_null) + 1. / len(d_mean_null),
             alpha=0.8,
             color='#175ac6')
    ax4.axvline(dist_df, color='red', lw=2, ls='--')
    #ax4.set_xlabel("Mean distance", fontsize = 14)
    ax4.set_xlabel('Mean distance, ' + r'$\left \langle d \right \rangle$',
                   fontsize=14)
    ax4.set_ylabel("Frequency", fontsize=14)

    d_mean_null.append(dist_df)
    relative_position_d_mean = sorted(d_mean_null).index(dist_df) / (
        len(d_mean_null) - 1)
    if relative_position_d_mean > 0.5:
        p_score_d_mean = 1 - relative_position_d_mean
    else:
        p_score_d_mean = relative_position_d_mean
    print('mean pairwise distance p-score = ' + str(round(p_score_d_mean, 3)))
    #ax4.text(89.1, 0.09, r'$p \nless  0.05$', fontsize = 10)

    plt.tight_layout()
    fig_name = nt.get_path() + '/figs/fig4.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
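The three null-distribution panels in fig4 repeat the same ranking logic; a hypothetical helper (not in the original repository) that factors it out:

def empirical_p_score(null_values, observed):
    # Rank the observed value within the null distribution and report
    # the smaller tail, matching the repeated blocks in fig4 above.
    values = sorted(list(null_values) + [observed])
    rel_pos = values.index(observed) / (len(values) - 1)
    return min(rel_pos, 1 - rel_pos)

print(empirical_p_score([1.0, 2.0, 3.0, 4.0], 3.9))  # 0.25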