def calc_betweenness_centrality(G, cmap):
    # this function calculates the betweenness centrality of each node, and returns colors corresponding to these values
    local_BC = nx.betweenness_centrality(G)
    local_BC_scale = [
        round(local_BC[key] * float(255)) for key in local_BC.keys()
    ]
    local_BC_scale = pd_Series(local_BC_scale, index=G.nodes())
    rfrac = [cmap(int(x))[0] for x in local_BC_scale]
    gfrac = [cmap(int(x))[1] for x in local_BC_scale]
    bfrac = [cmap(int(x))[2] for x in local_BC_scale]
    rfrac = pd_Series(rfrac, index=G.nodes())
    gfrac = pd_Series(gfrac, index=G.nodes())
    bfrac = pd_Series(bfrac, index=G.nodes())

    return rfrac, gfrac, bfrac
def calc_clustering_coefficient(G, cmap):
    # this function calculates the clustering coefficient of each node, and returns colors corresponding to these values
    local_CC = nx.clustering(G, G.nodes())
    local_CC_scale = [
        round(local_CC[key] * float(255)) for key in local_CC.keys()
    ]
    local_CC_scale = pd_Series(local_CC_scale, index=G.nodes())
    rfrac = [cmap(int(x))[0] for x in local_CC_scale]
    gfrac = [cmap(int(x))[1] for x in local_CC_scale]
    bfrac = [cmap(int(x))[2] for x in local_CC_scale]
    rfrac = pd_Series(rfrac, index=G.nodes())
    gfrac = pd_Series(gfrac, index=G.nodes())
    bfrac = pd_Series(bfrac, index=G.nodes())

    return rfrac, gfrac, bfrac
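A minimal usage sketch for the two color helpers above, assuming networkx, matplotlib, and the pd_Series alias (pandas.Series) are importable in the helpers' module; the karate-club graph and the 'OrRd' colormap are stand-ins, not part of the original code.

# Hedged usage sketch for calc_betweenness_centrality / calc_clustering_coefficient.
# Assumes the helpers' module already has: import networkx as nx; from pandas import Series as pd_Series
import matplotlib.pyplot as plt
import networkx as nx

G = nx.karate_club_graph()            # stand-in graph
cmap = plt.get_cmap('OrRd')           # any 256-entry matplotlib colormap works

rfrac, gfrac, bfrac = calc_betweenness_centrality(G, cmap)
# each return value is a Series indexed by node, holding an RGB fraction in [0, 1]
node_order = list(G.nodes())
node_colors = list(zip(rfrac[node_order], gfrac[node_order], bfrac[node_order]))
nx.draw(G, node_color=node_colors)
plt.show()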
Example #3
def updateScore(csvfile, score):
    """ Add or update score column and reorder """
    import string
    head, rows = read_csv(csvfile)  # local CSV helper; its result is not used below
    data = pd_read_csv(csvfile)
    data.index = data.index + 1
    cols = data.columns.tolist()
    sco = pd_Series(np_zeros(len(data[cols[0]])), index=data.index)
    if 'Score' not in cols:
        data['Score'] = sco
        cols = ['Score'] + cols
        data = data[cols]
    colk = list(string.ascii_uppercase)
    for sc in score:
        try:
            coln = colk.index(sc[0])
            val = sc[2]
            checked = sc[3]
            if checked:
                sco += val * data.iloc[:, coln]
        except Exception:
            # skip malformed score entries rather than aborting the whole update
            continue
    data['Score'] = sco
    data = data.sort_values('Score', ascending=False)
    updateMSA(os_path.dirname(csvfile), [[v] for v in data['Seq. ID']])
    data = data.reset_index(drop=True)
    data.index = data.index + 1
    data = data.rename_axis('Select', axis="columns")  # rename_axis returns a new frame; assign the result
    data.to_csv(csvfile, quoting=csv_QUOTE_ALL, index=False)
    return data
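updateScore consumes score positionally (sc[0] is a spreadsheet-style column letter, sc[2] a weight, sc[3] a checked flag), so a call presumably looks like the sketch below; the tuple layout and the CSV path are assumptions, and the module-level read_csv and updateMSA helpers must already exist.

# Hedged call sketch for updateScore. The tuple layout (column letter, label,
# weight, checked) is inferred from the sc[0]/sc[2]/sc[3] indexing above, and
# 'results/hits.csv' is a hypothetical path.
score = [
    ('B', 'Identity', 2.0, True),     # weight column B by 2.0
    ('C', 'Coverage', 1.0, True),     # weight column C by 1.0
    ('D', 'E-value', 0.5, False),     # unchecked, so it is skipped
]
ranked = updateScore('results/hits.csv', score)
print(ranked[['Score', 'Seq. ID']].head())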
def translate_episode_data(episode_data):
    """
    Convert episode data into data that
    can be used in a graph.

    Given data from multiple episodes make
    it such that it can be plotted by tsplot,
    i.e. the mean plus the confidence bounds.
    """
    times, units, values = [], [], []
    for index, (ep_len, ep_rew) in enumerate(episode_data):
        # Smooth out the data
        ep_rew = pd_Series(ep_rew).ewm(span=1000).mean()
        # sample for faster plotting
        x, y = sample(bins=np_linspace(0, MAX_TSTEPS, NSAMPLES + 1),
                      time=ep_len,
                      value=ep_rew)
        # Convert to tsplot format
        times.extend(x)
        values.extend(y)
        units.extend([index] * len(x))
    return pd_DataFrame({
        'Frame': times,
        'run_id': units,
        'Average Episode Reward': values
    })
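The returned frame is in long format (one row per sampled timestep per run), which is what seaborn aggregates over. A minimal plotting sketch, assuming the module-level sample helper and the MAX_TSTEPS/NSAMPLES constants are defined, and using sns.lineplot as a stand-in for the deprecated tsplot the docstring mentions.

# Hedged plotting sketch: aggregate the per-run samples into a mean curve with a spread band.
import matplotlib.pyplot as plt
import seaborn as sns

df = translate_episode_data(episode_data)   # episode_data: one (lengths, rewards) pair per run
sns.lineplot(data=df, x='Frame', y='Average Episode Reward',
             errorbar='sd')                 # seaborn >= 0.12; older versions take ci='sd'
plt.show()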
def calc_community_fraction(G, to_nodes, from_nodes, from_nodes_partition,
                            color_list):
    # set each to-node's color to its most common neighbor community, darkened
    # by the fraction of neighbors that belong to other communities
    degree = G.degree(to_nodes)
    rfrac = pd_Series(index=G.nodes(), dtype=float)
    gfrac = pd_Series(index=G.nodes(), dtype=float)
    bfrac = pd_Series(index=G.nodes(), dtype=float)
    for t in to_nodes:
        t_neighbors = G.neighbors(t)
        t_comms = [from_nodes_partition[i] for i in t_neighbors]
        t_comms = pd_Series(t_comms)

        unique_comms = t_comms.unique()
        num_unique_comms = len(unique_comms)

        num_n = pd_Series(index=unique_comms, dtype=float)
        for n in unique_comms:
            num_n[n] = (t_comms == n).sum()

        # pick the dominant community; idxmax returns its label
        # (Series.argmax returns a position in recent pandas, which would mis-index color_list)
        color_max = color_list[num_n.idxmax()][0:3]

        # how much is shared by other colors?
        #print(num_n)
        frac_shared = 1 - np.max(num_n) / np.sum(num_n)

        # darken the color by this amount
        #color_dark = shade_color(color_max,-frac_shared*100)
        color_dark = (color_max[0] * (1 - frac_shared),
                      color_max[1] * (1 - frac_shared),
                      color_max[2] * (1 - frac_shared))

        rfrac[t] = color_dark[0]
        gfrac[t] = color_dark[1]
        bfrac[t] = color_dark[2]

    # fill in the from_nodes colors
    for f in from_nodes:
        f_group = from_nodes_partition[f]
        rfrac[f] = color_list[f_group][0]
        gfrac[f] = color_list[f_group][1]
        bfrac[f] = color_list[f_group][2]

    return rfrac, gfrac, bfrac
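A sketch of how the blend helper above is driven elsewhere in this file: partition a graph with python-louvain, build one gist_rainbow color per community, and color every node by its dominant neighbor community darkened by how contested it is. The karate-club graph is a stand-in, and the pd_Series alias is assumed to be available in the helper's module as in the snippets above.

# Hedged usage sketch for calc_community_fraction, mirroring the call in save_gene_gene_json below.
import community                            # python-louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from pandas import Series as pd_Series

G = nx.karate_club_graph()                  # stand-in graph
partition = pd_Series(community.best_partition(G))
color_list = plt.cm.gist_rainbow(np.linspace(0, 1, partition.nunique()))

nodes = list(G.nodes())
rfrac, gfrac, bfrac = calc_community_fraction(G, nodes, nodes, partition, color_list)
nx.draw(G, node_color=list(zip(rfrac[nodes], gfrac[nodes], bfrac[nodes])))
plt.show()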
def save_gene_gene_json(input_file_name,
                        out_file_start,
                        cluster_id,
                        color_type='clustering_coefficient',
                        colormap='OrRd'):
    '''
    This function takes a processed cluster file ('input_file_name': the output of the
    process-clustering-results step in cluster_analysis_module) and saves a JSON graph for
    every community in the file, with names starting with 'out_file_start'.  Both
    'input_file_name' and 'out_file_start' should include their directory paths.
    '''

    # first load in a network (edge list)
    edge_list_df = pd_read_csv(input_file_name, sep='\t')

    group_ids = np.unique(edge_list_df['group_id'])

    for focal_group in group_ids:
        print(focal_group)

        save_file_name = out_file_start + '_' + str(int(focal_group)) + '.json'

        idx_group = (edge_list_df['group_id'] == focal_group)
        idx_group = list(edge_list_df['group_id'][idx_group].index)

        # make a network out of it

        #edge_list = [(edge_list_df['var1'][i], edge_list_df['var2'][i], np.abs(edge_list_df['corr'][i])) for i in idx_group if edge_list_df['corr'][i] !=0]
        edge_list = [(edge_list_df['var1'][i], edge_list_df['var2'][i],
                      edge_list_df['corr'][i]) for i in idx_group
                     if edge_list_df['corr'][i] != 0]

        Gsmall = nx.Graph()
        Gsmall.add_weighted_edges_from(edge_list)

        # materialize the NetworkX views as lists (2.x views are not indexable below)
        nodes = list(Gsmall.nodes())
        numnodes = len(nodes)
        edges = list(Gsmall.edges(data=True))
        numedges = len(edges)
        if color_type == 'community':
            partition = community.best_partition(Gsmall)

            partition = pd_Series(partition)
            col_temp = partition[nodes]

            # Set up json for saving
            # what should the colors be??
            num_communities = len(np.unique(col_temp))
            color_list = plt.cm.gist_rainbow(np.linspace(
                0, 1, num_communities))

            # blend the community colors (so that to-nodes are a mixture of all the communities they belong to)
            rfrac, gfrac, bfrac = calc_community_fraction(
                Gsmall, Gsmall.nodes(), Gsmall.nodes(), partition, color_list)

            #nodes_dict = [{"id":n,"com":col_temp[n],"degree":author_gene_bp.degree(n)} for n in nodes]
            nodes_dict = [{
                "id": n,
                "com": col_temp[n],
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]
        elif color_type == 'clustering_coefficient':
            cmap = plt.get_cmap(colormap)
            rfrac, gfrac, bfrac = calc_clustering_coefficient(Gsmall, cmap)
            nodes_dict = [{
                "id": n,
                "com": 0,
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]

        elif color_type == 'betweenness_centrality':
            cmap = plt.get_cmap(colormap)
            rfrac, gfrac, bfrac = calc_betweenness_centrality(Gsmall, cmap)
            nodes_dict = [{
                "id": n,
                "com": 0,
                "degree": Gsmall.degree(n),
                "rfrac": rfrac[n] * 255,
                "gfrac": gfrac[n] * 255,
                "bfrac": bfrac[n] * 255
            } for n in nodes]

        # save network in json format
        #nodes = Gsmall.nodes()
        #numnodes = len(nodes)
        #edges=Gsmall.edges(data=True)
        #numedges = len(edges)
        #nodes_dict = [{"id":n,"com":col_temp[n],"degree":author_gene_bp.degree(n)} for n in nodes]
        #nodes_dict = [{"id":n,"com":col_temp[n],"degree":Gsmall.degree(n),
        #              "rfrac":rfrac[n]*255,"gfrac":gfrac[n]*255,"bfrac":bfrac[n]*255} for n in nodes]

        node_map = dict(
            zip(nodes,
                range(numnodes)))  # map to indices for source/target in edges
        edges_dict = [{
            "source": node_map[edges[i][0]],
            "target": node_map[edges[i][1]],
            "weight": edges[i][2]['weight']
        } for i in range(numedges)]

        import json
        json_graph = {
            "directed": False,
            "nodes": nodes_dict,
            "links": edges_dict
        }

        client = pymongo.MongoClient()
        db = client.cache

        heat_maps = db.heat_map_graph

        a = {
            'clusterId': 'cluster' + str(int(focal_group)),
            'heat_map': json.dumps(json_graph)  #heat_map_ordered_transposed
        }
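A hedged call sketch for save_gene_gene_json; the paths are hypothetical, and the prerequisites are inferred from the code above (a tab-separated edge list with group_id/var1/var2/corr columns, plus a reachable local MongoDB).

# Hedged call sketch; 'clusters/processed_clusters.txt' and 'output/gene_gene' are
# hypothetical paths. A local MongoDB must be reachable because the function opens
# pymongo.MongoClient() for every group, and cluster_id is accepted but not used above.
save_gene_gene_json('clusters/processed_clusters.txt',
                    'output/gene_gene',
                    cluster_id=0,
                    color_type='community')   # or 'clustering_coefficient' / 'betweenness_centrality'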
def analyze_AG_bipartite_network(genes,
                                 authors_GB_genes,
                                 pub_thresh=1,
                                 save_file_name="author_gene_bp.json",
                                 plot_flag=False):
    gene_list = genes.split(',')

    t0 = time.time()

    # unpickle groupby object
    #authors_GB_genes = pd.read_pickle(author_gene_GB_fname)
    authors_GB_genes = app.authors_GB_genes_loaded

    # get rid of invalid genes in gene_list
    new_gene_list = []
    for gene in gene_list:
        if gene in authors_GB_genes:
            new_gene_list.append(gene)

    gene_list = new_gene_list

    # create list of all authors/weights who have published on at least one gene in gene_list
    AW_list_total = []
    for gene in gene_list:
        AW_list_total.extend(list(authors_GB_genes[gene].index))
    AW_list_total = list(zip(*AW_list_total))  # materialize: zip() is lazy in Python 3

    author_list_total = AW_list_total[0]
    weight_list_total = AW_list_total[1]

    print(time.time() - t0)

    author_list_total = pd_Series(author_list_total)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # take the mean of duplicate entries
    df_temp = pd_DataFrame(
        {
            'weight': list(weight_list_total),
            'author': list(author_list_total)
        },
        index=range(len(author_list_total)))
    AW_gb_temp = df_temp.weight.groupby(df_temp['author']).mean()

    author_list_total = list(AW_gb_temp.index)
    weight_list_total = list(AW_gb_temp.values)
    weight_list_total = pd_Series(weight_list_total, index=author_list_total)

    # make a dataframe, indexed by authors in author_list_total, with columns = entries in gene_list
    author_gene_df = pd_DataFrame(np.zeros(
        [len(author_list_total), len(gene_list)]),
                                  index=author_list_total,
                                  columns=gene_list)

    print(time.time() - t0)

    # fill in the dataframe
    for gene in gene_list:
        #print(gene)
        temp = list(authors_GB_genes[gene].index)
        temp = list(zip(*temp))  # materialize before indexing (Python 3 zip is lazy)
        authors_temp = list(np.unique(temp[0]))
        author_gene_df.loc[authors_temp, gene] = weight_list_total[authors_temp]

    print(time.time() - t0)

    # add a column for total weight
    author_gene_df['total_weight'] = np.sum(np.array(author_gene_df), 1)

    author_gene_df.sort_values('total_weight', inplace=True, ascending=False)

    # next, convert this dataframe into bipartite network
    # make the small bipartite graph
    author_gene_bp = nx.Graph()

    # pick out authors which have published on > pub_thresh genes in gene_list
    index_temp = list(author_gene_df['total_weight'][
        author_gene_df['total_weight'] > pub_thresh].index)

    # only allow 200 authors max
    if len(index_temp) > 200:
        author_nodes = index_temp[0:200]
    else:
        author_nodes = index_temp
    #index_temp = list(author_gene_df['total_num'].index)
    #author_nodes = index_temp[0:num_authors]

    print(time.time() - t0)

    for gene in gene_list:
        for author in author_nodes:
            # only add a link if connection exists
            if author_gene_df[gene][author] > 0:
                author_gene_bp.add_edge(gene, author)

    # add all genes in gene_list in case none of them come up
    author_gene_bp.add_nodes_from(gene_list)

    # now apply clustering algo to the bipartite graph
    partition = community.best_partition(author_gene_bp)
    partition = pd_Series(partition)
    col_temp_authors = partition[author_nodes]
    col_temp_genes = partition[gene_list]
    col_temp = partition[list(author_gene_bp.nodes())]

    if plot_flag:
        # plot graph if plot_flag = True
        plt.figure(figsize=[15, 15])
        pos = nx.spring_layout(author_gene_bp, k=.3)
        #nx.draw(author_gene_bp,pos=pos,alpha=.5,node_size=100,node_color = col_temp,cmap='Paired')

        gene_list = list(gene_list)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=author_nodes,
                               node_color=col_temp_authors,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=100)
        nx.draw_networkx_nodes(author_gene_bp,
                               nodelist=gene_list,
                               node_color=col_temp_genes,
                               cmap='Paired',
                               pos=pos,
                               alpha=.5,
                               node_size=200,
                               node_shape='s')
        nx.draw_networkx_edges(author_gene_bp, pos=pos, alpha=.1)
        node_subset_dict = dict(zip(index_temp[0:20], index_temp[0:20]))
        gene_subset_dict = dict(zip(gene_list, gene_list))
        node_subset_dict.update(gene_subset_dict)  # update() mutates in place and returns None
        nx.draw_networkx_labels(author_gene_bp,
                                pos=pos,
                                labels=node_subset_dict)

    # Set up json for saving
    # what should the colors be??
    num_communities = len(np.unique(col_temp))
    color_list = plt.cm.gist_rainbow(np.linspace(0, 1, num_communities))

    # blend the community colors (so that to-nodes are a mixture of all the communities they belong to)
    rfrac, gfrac, bfrac = calc_community_fraction(author_gene_bp, author_nodes,
                                                  gene_list, partition,
                                                  color_list)

    # save network in json format
    # materialize views so the edges[i] indexing below works under NetworkX 2.x
    nodes = list(author_gene_bp.nodes())
    numnodes = len(nodes)
    edges = list(author_gene_bp.edges())
    numedges = len(edges)
    #nodes_dict = [{"id":n,"com":col_temp[n],"degree":author_gene_bp.degree(n)} for n in nodes]
    nodes_dict = [{
        "id": n,
        "com": col_temp[n],
        "degree": author_gene_bp.degree(n),
        "rfrac": rfrac[n] * 255,
        "gfrac": gfrac[n] * 255,
        "bfrac": bfrac[n] * 255
    } for n in nodes]
    node_map = dict(zip(
        nodes, range(numnodes)))  # map to indices for source/target in edges
    edges_dict = [{
        "source": node_map[edges[i][0]],
        "target": node_map[edges[i][1]]
    } for i in range(numedges)]

    #import json
    json_graph = {"directed": False, "nodes": nodes_dict, "links": edges_dict}
    #json.dump(json_graph,open(save_file_name,'w'))

    print(time.time() - t0)

    return json_graph
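A hedged call sketch; note that whatever is passed as authors_GB_genes is immediately replaced by the preloaded app.authors_GB_genes_loaded, and the gene symbols here are examples only.

# Hedged call sketch. The second argument is overwritten inside the function by
# app.authors_GB_genes_loaded, so the value passed here is ignored.
graph_json = analyze_AG_bipartite_network('TP53,BRCA1,EGFR',
                                          None,
                                          pub_thresh=1,
                                          plot_flag=False)
print(len(graph_json['nodes']), len(graph_json['links']))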
Example #8
def get_most_similar_coordinates(word,
                                 count=10,
                                 pos='any',
                                 remove_stop=True):
    word = word.lower()
    word_vectors = book2vec.wv

    try:
        # wv.vocab exists in gensim < 4.0; gensim 4.x replaces it with wv.key_to_index
        word_seq = word_vectors.most_similar(word,
                                             topn=len(word_vectors.vocab))
        sim_words = [x[0] for x in word_seq]
        if (remove_stop):
            #from nltk.corpus import stopwords
            #nltk.download("stopwords")
            #stop_words = set(stopwords.words('english'))
            stop_words = {
                'a', 'about', 'above', 'after', 'again', 'against', 'ain',
                'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as',
                'at', 'be', 'because', 'been', 'before', 'being', 'below',
                'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't",
                'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't",
                'doing', 'don', "don't", 'down', 'during', 'each', 'few',
                'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has',
                'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he',
                'her', 'here', 'hers', 'herself', 'him', 'himself', 'his',
                'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it',
                "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me',
                'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my',
                'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o',
                'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our',
                'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                'shan', "shan't", 'she', "she's", 'should', "should've",
                'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
                'that', "that'll", 'the', 'their', 'theirs', 'them',
                'themselves', 'then', 'there', 'these', 'they', 'this',
                'those', 'through', 'to', 'too', 'under', 'until', 'up', 've',
                'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren',
                "weren't", 'what', 'when', 'where', 'which', 'while', 'who',
                'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn',
                "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've",
                'your', 'yours', 'yourself', 'yourselves'
            }
            without_stop = [x for x in sim_words if x not in stop_words]
            sim_words = without_stop

        slice2 = points
        if pos != 'any':
            slice2 = points.loc[points['attr'].isin([pos])]
        attr_words = slice2['word'].tolist()
        res_words = [x for x in sim_words if x in attr_words]
        res_words = res_words[:count]

        slice = points.loc[points['word'].isin(res_words)]

        slice = slice.append(points.loc[points['word'] == word])  # DataFrame.append was removed in pandas 2.0; pandas.concat is the replacement
        slice['sim_score'] = pd_Series(1.0, index=slice.index)
        slice['word_count'] = pd_Series(0, index=slice.index)
        for i, point in slice.iterrows():
            slice.at[i, 'word_count'] = get_word_count(point.word,
                                                       word_vectors)
            for (w, s) in word_seq:
                if w == point.word:
                    slice.at[i, 'sim_score'] = round(s, 3)
                    break

        retVal = slice
    except Exception:
        # most_similar raises KeyError for out-of-vocabulary words; return None instead
        retVal = None
    return retVal
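A hedged call sketch; book2vec (a trained gensim model), points (a DataFrame with 'word' and 'attr' columns), and get_word_count are module-level objects the function closes over and must already be loaded. The query word is an example.

# Hedged call sketch; assumes the module-level book2vec model and points frame are loaded.
result = get_most_similar_coordinates('whale', count=5, pos='any')
if result is not None:
    print(result[['word', 'sim_score', 'word_count']])
else:
    print('word not found in the model vocabulary')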