import pandas as pd
from networkx import DiGraph
from tqdm import tqdm


def add_degree_features(G: DiGraph, df: pd.DataFrame) -> pd.DataFrame:
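    """Compute per-edge degree, neighbourhood-overlap and similarity features for
    every (source, sink) pair in df["edge"] and return them as a new DataFrame.

    The similarity helpers (jaccard_coeff, cosine_distance, preferential_attachment,
    directed_resource_allocation, directed_adamic_adar) are assumed to be defined
    elsewhere in the module; a sketch follows this function.
    """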
    source_in_degree = []
    source_out_degree = []
    source_bi_degree = []
    source_nbrs = []

    sink_in_degree = []
    sink_out_degree = []
    sink_bi_degree = []
    sink_nbrs = []

    common_neighbors = []
    total_neighbors = []
    transitive_links = []

    JC_predecessors = []
    JC_successors = []
    JC_transient_in = []
    JC_transient_out = []
    JC_neighbors = []

    cos_predecessors = []
    cos_successors = []
    cos_transient_in = []
    cos_transient_out = []
    cos_neighbors = []

    PA_predecessors = []
    PA_successors = []
    PA_transient_in = []
    PA_transient_out = []
    PA_neighbors = []

    RA_predecessors = []
    RA_successors = []
    RA_transient_in = []
    RA_transient_out = []
    RA_neighbors = []

    AA_predecessors = []
    AA_successors = []
    AA_transient_in = []
    AA_transient_out = []
    AA_neighbors = []

    hub_promoted_index = []
    hub_suppressed_index = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
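        # For each candidate edge, collect degree counts for the source and sink, then
        # set-overlap similarity scores over five neighbourhood pairings
        # (predecessors, successors, transient in/out, and all neighbours).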
        source, sink = row["edge"]
        try:
            s_in = set(G.predecessors(source))
            s_out = set(G.successors(source))
            s_bi = s_in.intersection(s_out)
            s_nbrs = s_in.union(s_out)
        except Exception:  # source is not a node of G
            s_in = set()
            s_out = set()
            s_bi = set()
            s_nbrs = set()
        try:
            d_in = set(G.predecessors(sink))
            d_out = set(G.successors(sink))
            d_bi = d_in.intersection(d_out)
            d_nbrs = d_in.union(d_out)
        except Exception:  # sink is not a node of G
            d_in = set()
            d_out = set()
            d_bi = set()
            d_nbrs = set()

        source_in_degree.append(len(s_in))
        source_out_degree.append(len(s_out))
        source_bi_degree.append(len(s_bi))
        source_nbrs.append(len(s_nbrs))

        sink_in_degree.append(len(d_in))
        sink_out_degree.append(len(d_out))
        sink_bi_degree.append(len(d_bi))
        sink_nbrs.append(len(d_nbrs))

        common = len(s_nbrs.intersection(d_nbrs))
        common_neighbors.append(common)
        total_neighbors.append(len(s_nbrs.union(d_nbrs)))
        transitive_links.append(len(s_out.intersection(d_in)))

        JC_predecessors.append(jaccard_coeff(s_in, d_in))
        JC_successors.append(jaccard_coeff(s_out, d_out))
        JC_transient_in.append(jaccard_coeff(s_out, d_in))
        JC_transient_out.append(jaccard_coeff(s_in, d_out))
        JC_neighbors.append(jaccard_coeff(s_nbrs, d_nbrs))

        cos_predecessors.append(cosine_distance(s_in, d_in))
        cos_successors.append(cosine_distance(s_out, d_out))
        cos_transient_in.append(cosine_distance(s_out, d_in))
        cos_transient_out.append(cosine_distance(s_in, d_out))
        cos_neighbors.append(cosine_distance(s_nbrs, d_nbrs))

        PA_predecessors.append(preferential_attachment(s_in, d_in))
        PA_successors.append(preferential_attachment(s_out, d_out))
        PA_transient_in.append(preferential_attachment(s_out, d_in))
        PA_transient_out.append(preferential_attachment(s_in, d_out))
        PA_neighbors.append(preferential_attachment(s_nbrs, d_nbrs))

        RA_predecessors.append(directed_resource_allocation(s_in, d_in, G))
        RA_successors.append(directed_resource_allocation(s_out, d_out, G))
        RA_transient_in.append(directed_resource_allocation(s_out, d_in, G))
        RA_transient_out.append(directed_resource_allocation(s_in, d_out, G))
        RA_neighbors.append(directed_resource_allocation(s_nbrs, d_nbrs, G))

        AA_predecessors.append(directed_adamic_adar(s_in, d_in, G))
        AA_successors.append(directed_adamic_adar(s_out, d_out, G))
        AA_transient_in.append(directed_adamic_adar(s_out, d_in, G))
        AA_transient_out.append(directed_adamic_adar(s_in, d_out, G))
        AA_neighbors.append(directed_adamic_adar(s_nbrs, d_nbrs, G))

        try:
            hub_promoted_index.append(common / min(len(s_nbrs), len(d_nbrs)))
        except ZeroDivisionError:  # at least one node has no neighbours
            hub_promoted_index.append(0.0)
        try:
            hub_suppressed_index.append(common / max(len(s_nbrs), len(d_nbrs)))
        except ZeroDivisionError:  # both nodes have no neighbours
            hub_suppressed_index.append(0.0)

    df = pd.DataFrame({
        "edge": df.edge,
        "source_in_degree": source_in_degree,
        "source_out_degree": source_out_degree,
        "source_bi_degree": source_bi_degree,
        "source_neighbors": source_nbrs,
        "sink_in_degree": sink_in_degree,
        "sink_out_degree": sink_out_degree,
        "sink_bi_degree": sink_out_degree,
        "sink_neighbors": sink_nbrs,
        "common_neighbors": common_neighbors,
        "total_neighbors": total_neighbors,
        "transitive_links": transitive_links,
        "JC_predecessors": JC_predecessors,
        "JC_successors": JC_successors,
        "JC_transient_in": JC_transient_in,
        "JC_transient_out": JC_transient_out,
        "JC_neighbors": JC_neighbors,
        "cos_predecessors": cos_predecessors,
        "cos_successors": cos_successors,
        "cos_transient_in": cos_transient_in,
        "cos_transient_out": cos_transient_out,
        "cos_neighbors": cos_neighbors,
        "PA_predecessors": PA_predecessors,
        "PA_successors": PA_successors,
        "PA_transient_in": PA_transient_in,
        "PA_transient_out": PA_transient_out,
        "PA_neighbors": PA_neighbors,
        "RA_predecessors": RA_predecessors,
        "RA_successors": RA_successors,
        "RA_transient_in": RA_transient_in,
        "RA_transient_out": RA_transient_out,
        "RA_neighbors": RA_neighbors,
        "AA_predecessors": AA_predecessors,
        "AA_successors": AA_successors,
        "AA_transient_in": AA_transient_in,
        "AA_transient_out": AA_transient_out,
        "AA_neighbors": AA_neighbors,
        "hub_promoted_index": hub_promoted_index,
        "hub_suppressed_index": hub_suppressed_index,
    })

    # Other indices
    df["sorensen_index"] = 2 * (
        df["common_neighbors"] /
        (df["source_neighbors"] + df["sink_neighbors"]))
    df["LHN_index"] = df["common_neighbors"] / (df["source_neighbors"] *
                                                df["sink_neighbors"])

    # Calculate degree densities
    df["source_in_density"] = df["source_in_degree"] / df["source_neighbors"]
    df["source_out_density"] = df["source_out_degree"] / df["source_neighbors"]
    df["source_bi_density"] = df["source_bi_degree"] / df["source_neighbors"]

    df["sink_in_density"] = df["sink_in_degree"] / df["sink_neighbors"]
    df["sink_out_density"] = df["sink_out_degree"] / df["sink_neighbors"]
    df["sink_bi_density"] = df["sink_bi_degree"] / df["sink_neighbors"]

    return df
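
The similarity helpers called above (jaccard_coeff, cosine_distance, preferential_attachment,
directed_resource_allocation, directed_adamic_adar) are not part of this snippet. A minimal
sketch of set-based implementations matching the signatures used here (an assumption, not the
original definitions):

import math


def jaccard_coeff(a, b):
    # |A ∩ B| / |A ∪ B|; returns 0.0 when both sets are empty (assumed convention)
    union = a | b
    return len(a & b) / len(union) if union else 0.0


def cosine_distance(a, b):
    # set cosine similarity |A ∩ B| / sqrt(|A| * |B|); 0.0 when either set is empty
    return len(a & b) / math.sqrt(len(a) * len(b)) if a and b else 0.0


def preferential_attachment(a, b):
    # |A| * |B|
    return len(a) * len(b)


def directed_resource_allocation(a, b, G):
    # sum of 1 / deg(z) over common members z, using total degree in the DiGraph
    return sum(1.0 / G.degree(z) for z in a & b if G.degree(z) > 0)


def directed_adamic_adar(a, b, G):
    # sum of 1 / log(deg(z)) over common members z with degree > 1
    return sum(1.0 / math.log(G.degree(z)) for z in a & b if G.degree(z) > 1)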
Example #2

from networkx import DiGraph, find_cliques


def tree0(weight_value, startwindow, term):
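    """Build a clique-intersection hierarchy (one cliqueGraph node per term) over the
    per-window graphs, starting at startwindow.

    Relies on module-level names that are not part of this snippet: windowGraph
    (dict of per-window undirected graphs), data (matrix whose column count bounds
    the window range) and size_clique (minimum clique size).
    """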

    print('start window:', startwindow)
    # windowGraph = {}
    cliqueGraph = DiGraph()
    dic_term = {}
    dic_last_time = {}
    dic_temp = {}
    dic_term_num = {}
    dic_intersect_level = {}
    # term = 183
    
    root = 0
    cliqueGraph.add_node(root, annotation='root', windowsize='root', weight='root')
    w = data.shape[1]
    i = 0
    q = 0
    
    for window in range(startwindow, w):
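        # The first window seeds the hierarchy with one term per sufficiently large clique;
        # every later window intersects its cliques with the terms recorded from the previous
        # window and inserts the intersections as child terms.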
        dic_intersect_level.clear()
        #print window ## mine
        if window == startwindow:

            for clique in find_cliques(windowGraph[window]):
                if len(clique) > size_clique:
                    cliqueGraph.add_node(term, annotation=list(clique), windowsize=[window],
                                         weight=weight_value)  # generate a term
                    cliqueGraph.add_edge(root, term)
                    dic_term[frozenset(clique)] = [window]  # dic_term records the window list for each clique
                    dic_term_num[frozenset(clique)] = term  # dic_term_num records the term number for each clique
                    dic_last_time[frozenset(clique)] = [window]  # dic_last_time keeps the sets generated in the previous window for comparison in the next one
                    term = term + 1
                    print('for start window')
                else:
                    continue
                    # print(len(dic_last_time), len(dic_term), cliqueGraph.number_of_nodes())

        else:

            for clique in find_cliques(windowGraph[window]):
                if len(clique) > size_clique:
                    #print window, 'clique:', clique ## mine

                    for key, value in dic_last_time.items():  # key is a clique (frozenset), value is its window list
                        intersect = sorted(set(key).intersection(set(clique)))
                        q = 0
                        if len(intersect) >= size_clique:
                            # print('intersect', intersect)
                            # Within the same level, check whether the new intersection duplicates a
                            # parent-child relationship: compare it against the other intersections
                            # already generated at the current level.
                            for ik, iv in dic_intersect_level.items():
                                if set(intersect) == set(ik):  # exactly the same intersection was generated
                                    # connect the two terms only if their numbers differ
                                    if dic_term_num[frozenset(key)] != dic_term_num[frozenset(ik)]:
                                        cliqueGraph.add_edge(dic_term_num[frozenset(key)], dic_term_num[frozenset(ik)])
                                    q = 1
                                    break
                                elif set(intersect).issuperset(set(ik)):  # a superset was generated
                                    cliqueGraph.remove_node(dic_term_num[frozenset(ik)])
                                    dic_term.pop(frozenset(ik))  # delete the node's information from all four dictionaries
                                    dic_term_num.pop(frozenset(ik))
                                    dic_intersect_level.pop(frozenset(ik))
                                    dic_temp.pop(frozenset(ik))
                                elif set(intersect).issubset(set(ik)):  # a subset was generated
                                    q = 1
                                    break
                            if q == 1:
                                continue
                            dic_intersect_level[frozenset(intersect)] = 1

                            if frozenset(intersect) in dic_term:
                                # the intersection has appeared before
                                parent = list(cliqueGraph.predecessors(dic_term_num[frozenset(intersect)]))
                                children = list(cliqueGraph.successors(dic_term_num[frozenset(intersect)]))
                                # print('parent', len(parent))
                                if len(parent) > 0:
                                    # the existing node is an intersection-generated term, so redirect it
                                    cliqueGraph.add_node(term, annotation=list(intersect),
                                                         windowsize=value + [window],
                                                         weight=weight_value)
                                    for p in parent:
                                        cliqueGraph.add_edge(p, term)  # edge to each old parent

                                    for c in children:
                                        cliqueGraph.add_edge(term, c)  # edge to each old child
                                    cliqueGraph.remove_node(dic_term_num[frozenset(intersect)])  # remove the redundant node from the graph

                                    # print('deleted intersect nodes:', dic_term_num[frozenset(intersect)])
                                    i = i + 1
                                    dic_term.pop(frozenset(intersect))  # delete from the dictionaries
                                    dic_term_num.pop(frozenset(intersect))

                                    dic_term[frozenset(intersect)] = value + [window]  # insert the new node into the dictionaries
                                    dic_term_num[frozenset(intersect)] = term
                                    dic_temp[frozenset(intersect)] = value + [window]  # record in dic_temp
                                    term = term + 1
                                    continue
                                else:
                                    # the existing node is a window-generated term
                                    continue
                            else:
                                # the intersection has not appeared before, so generate a new term
                                # print('new term intersect never appear:', term)
                                cliqueGraph.add_node(term, annotation=list(intersect), windowsize=value + [window],
                                                     weight=weight_value)  # generate a term

                                cliqueGraph.add_edge(dic_term_num[frozenset(key)], term)  # edge; change: only connect the intersected term as the parent
                                dic_term[frozenset(intersect)] = value + [window]  # insert the new node into the dictionaries
                                dic_term_num[frozenset(intersect)] = term
                                dic_temp[frozenset(intersect)] = value + [window]  # record in dic_temp
                                term = term + 1
                        else:
                            continue
                else:
                    continue
            dic_last_time.clear()
            for key, value in dic_temp.items():
                dic_last_time[key] = value
            dic_temp.clear()
    print('window', startwindow, 'size is', cliqueGraph.number_of_nodes(), cliqueGraph.number_of_edges())  ## mine
    # print 'deleted nodes:', i
    # fw = open('0904edges_remove.txt', 'w')
    # fw2 = open('0904terms_remove.txt', 'w')
    # fw.write('parent' + '\t' + 'child' + '\n')
    # for edge in cliqueGraph.edges():
    #     fw.write(str(edge[0]) + '\t' + str(edge[1]) + '\n')
    # fw.close()
    # fw2.write('term_id' + '\t' + 'anno_genes' + '\t' + 'window' + '\t' + 'gene_size' + '\t' + 'window_size' + '\n')
    # for key, value in dic_term.items():
    #     fw2.write(str(dic_term_num[key]) + '\t' + str(key) + '\t' + str(value) + '\t' + str(len(key)) + '\t' + str(len(value)) + '\n')
    # fw2.close()
    # for nodes in cliqueGraph.nodes():
    #     if cliqueGraph.degree(nodes) == 0:
    #         print nodes
 
    return cliqueGraph, dic_term, dic_term_num, term
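
A hypothetical driver for tree0 (the argument values below are illustrative, and the
module-level names windowGraph, data and size_clique are assumed to have been populated
elsewhere):

cliqueGraph, dic_term, dic_term_num, term = tree0(weight_value=1.0, startwindow=0, term=1)
print(cliqueGraph.number_of_nodes(), cliqueGraph.number_of_edges())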