Example #1
0
File: k_core.py Project: wtgme/ohsn
def coreness_features(g):
    """Plot the correlation between K-core shell index and feature values.

    Collapses the graph to undirected, stores each vertex's shell index in
    the 'core' attribute, then for every field writes one correlation plot
    (coreness vs. the field's 'pof' value) to data/corerel/<field>.pdf.

    :param g: igraph Graph; directed edges are collapsed.
    """
    g = g.as_undirected(mode="collapse")
    all_coreness = g.shell_index(mode='ALL')
    g.vs['core'] = all_coreness
    fields = iot.read_fields()
    for field in fields:
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # Select once (the original ran the same query twice): keep only
        # vertices whose 'pof' is not the missing-value sentinel (-1e9).
        valid = g.vs.select(pof_ne=-1000000000.0)
        pt.correlation(valid['core'], valid['pof'], 'K-Core', 'Feature',
                       'data/corerel/' + field + '.pdf')
Example #2
0
def coreness_features(g):
    """Correlation of K-core (shell index) and feature values.

    For each field read from the field list, attaches the field's value to
    the vertices as 'pof' and plots coreness against it, saving the plot to
    data/corerel/<field>.pdf.

    :param g: igraph Graph; directed edges are collapsed to undirected.
    """
    g = g.as_undirected(mode="collapse")
    g.vs['core'] = g.shell_index(mode='ALL')
    for field in iot.read_fields():
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # One selection instead of two identical ones: vertices whose 'pof'
        # differs from the sentinel -1e9 that marks a missing value.
        vs = g.vs.select(pof_ne=-1000000000.0)
        pt.correlation(vs['core'], vs['pof'], 'K-Core', 'Feature',
                       'data/corerel/' + field + '.pdf')
Example #3
0
def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    """Plot indegree vs. each feature of interest on the giant component.

    Restricts the graph to its weakly-connected giant component, trims both
    feature values and indegrees to their central 95% (2.5--97.5 percentile),
    and writes one correlation plot per field to data/<field>.pdf.

    :param gc: igraph Graph.
    :param dbname: database name forwarded to gt.add_attribute.
    :param comname: collection name forwarded to gt.add_attribute.
    :param db_field_names: iterable of field names to process.
    :param directed: unused; kept for interface compatibility.
    """
    g = gt.giant_component(gc, 'WEAK')

    g.vs['nt'] = g.degree(type="in")
    netatt = g.vs['nt']
    # Indegree bounds do not depend on the field: hoisted out of the loop
    # (the original recomputed them on every iteration).
    maxd, mind = np.percentile(netatt, 97.5), np.percentile(netatt, 2.5)

    for db_field_name in db_field_names:
        g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name)
        raw_values = np.array(g.vs['foi'])
        values = drop_initials(raw_values)

        # Only plot fields with enough valid observations.
        if len(values) > 100:
            # Trim feature outliers to the central 95% of observed values.
            maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            sg = g.subgraph(g.vs(foi_ge=minv, foi_le=maxv))
            # Trim indegree outliers with the precomputed bounds.
            sg = sg.subgraph(sg.vs(nt_ge=mind, nt_le=maxd))
            pt.correlation(sg.vs['nt'], sg.vs['foi'], 'Indegree', 'Feature',
                           'data/' + db_field_name + '.pdf')
Example #4
0
def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    """Correlate indegree with each feature on the weak giant component.

    Both the feature values ('foi') and indegrees ('nt') are clipped to
    their 2.5--97.5 percentile range before plotting; one PDF is written
    per field to data/<field>.pdf.

    :param gc: igraph Graph.
    :param dbname: database name forwarded to gt.add_attribute.
    :param comname: collection name forwarded to gt.add_attribute.
    :param db_field_names: iterable of field names to process.
    :param directed: unused; kept for interface compatibility.
    """
    g = gt.giant_component(gc, "WEAK")

    g.vs["nt"] = g.degree(type="in")
    netatt = g.vs["nt"]
    # Loop-invariant: the degree percentiles never change across fields,
    # so compute them once instead of per iteration.
    maxd, mind = np.percentile(netatt, 97.5), np.percentile(netatt, 2.5)

    for db_field_name in db_field_names:
        g = gt.add_attribute(g, "foi", dbname, comname, db_field_name)
        raw_values = np.array(g.vs["foi"])
        values = drop_initials(raw_values)

        # Skip sparsely-populated fields.
        if len(values) > 100:
            # Clip feature outliers to the central 95%.
            maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            sg = g.subgraph(g.vs(foi_ge=minv, foi_le=maxv))
            # Clip indegree outliers using the precomputed bounds.
            sg = sg.subgraph(sg.vs(nt_ge=mind, nt_le=maxd))
            pt.correlation(sg.vs["nt"], sg.vs["foi"], "Indegree", "Feature",
                           "data/" + db_field_name + ".pdf")
Example #5
0
def feature_assort_friend(g, dbname, comname, db_field_names, directed=True):
    """Using iGraph
    Assigning values different from zero or one to the adjacency matrix will be translated to one,
    unless the graph is weighted, in which case the numbers will be treated as weights
    """
    node_size, edge_size = g.vcount(), g.ecount()
    outputs = {}
    print ("Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_sore, p_value")
    for db_field_name in db_field_names:
        # print 'Processing ' + db_field_name
        g = gt.add_attribute(g, "foi", dbname, comname, db_field_name)
        raw_values = np.array(g.vs["foi"])
        values = drop_initials(raw_values)

        if len(values) > 100:
            output = ""
            # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            maxv, minv = max(values), min(values)
            vs = g.vs.select(foi_ge=minv, foi_le=maxv)
            sg = g.subgraph(vs)
            t_node_size, t_edge_size = len(sg.vs), len(sg.es)
            output += (
                db_field_name
                + ","
                + str(t_node_size)
                + ","
                + str(t_edge_size)
                + ","
                + str(float(t_node_size) / node_size)
                + ","
                + str(float(t_edge_size) / edge_size)
                + ","
                + str(sg.assortativity_degree(directed=directed))
                + ","
                + str(sg.assortativity("foi", "foi", directed=directed))
                + ","
            )
            raw_assort = sg.assortativity("foi", "foi", directed=directed)
            ass_list = list()
            for i in xrange(3000):
                np.random.shuffle(raw_values)
                g.vs["foi"] = raw_values
                vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                sg = g.subgraph(vs)
                ass_list.append(sg.assortativity("foi", "foi", directed=directed))
            ass_list = np.array(ass_list)
            amean, astd = np.mean(ass_list), np.std(ass_list)

            absobserved = abs(raw_assort)
            pval = (np.sum(ass_list >= absobserved) + np.sum(ass_list <= -absobserved)) / float(len(ass_list))
            zscore = (raw_assort - amean) / astd
            # print pval
            output += str(raw_assort) + "," + str(amean) + "," + str(astd) + "," + str(zscore) + "," + str(pval)
            print output
            if pval < 0.001:
                output += "***"
                outputs[output] = abs(zscore)
                continue
            if pval < 0.01:
                output += "**"
                outputs[output] = abs(zscore)
                continue
            if pval < 0.05:
                output += "*"
                outputs[output] = abs(zscore)
                continue
            else:
                outputs[output] = abs(zscore)
                continue
    return outputs
Example #6
0
def feature_assort_friend(g, dbname, comname, db_field_names, directed=True):
    '''Using iGraph
    Assigning values different from zero or one to the adjacency matrix will be translated to one,
    unless the graph is weighted, in which case the numbers will be treated as weights
    '''
    node_size, edge_size = g.vcount(), g.ecount()
    outputs = {}
    print(
        'Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_sore, p_value'
    )
    for db_field_name in db_field_names:
        # print 'Processing ' + db_field_name
        g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name)
        raw_values = np.array(g.vs['foi'])
        values = drop_initials(raw_values)

        if len(values) > 100:
            output = ''
            # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            maxv, minv = max(values), min(values)
            vs = g.vs.select(foi_ge=minv, foi_le=maxv)
            sg = g.subgraph(vs)
            t_node_size, t_edge_size = len(sg.vs), len(sg.es)
            output += db_field_name + ',' + str(t_node_size) + ',' + str(t_edge_size) + ',' \
                      + str(float(t_node_size)/node_size) + ',' + str(float(t_edge_size)/edge_size)+ ',' \
                      + str(sg.assortativity_degree(directed=directed)) + ',' \
                      + str(sg.assortativity('foi', 'foi', directed=directed)) + ','
            raw_assort = sg.assortativity('foi', 'foi', directed=directed)
            ass_list = list()
            for i in xrange(3000):
                np.random.shuffle(raw_values)
                g.vs["foi"] = raw_values
                vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                sg = g.subgraph(vs)
                ass_list.append(
                    sg.assortativity('foi', 'foi', directed=directed))
            ass_list = np.array(ass_list)
            amean, astd = np.mean(ass_list), np.std(ass_list)

            absobserved = abs(raw_assort)
            pval = (np.sum(ass_list >= absobserved) +
                    np.sum(ass_list <= -absobserved)) / float(len(ass_list))
            zscore = (raw_assort - amean) / astd
            # print pval
            output += str(raw_assort) + ',' + str(amean) + ',' + str(
                astd) + ',' + str(zscore) + ',' + str(pval)
            print output
            if pval < 0.001:
                output += '***'
                outputs[output] = abs(zscore)
                continue
            if pval < 0.01:
                output += '**'
                outputs[output] = abs(zscore)
                continue
            if pval < 0.05:
                output += '*'
                outputs[output] = abs(zscore)
                continue
            else:
                outputs[output] = abs(zscore)
                continue
    return outputs
Example #7
0
def network_assort():
    # test network assortative
    gs = ['edfollow', 'follow', 'retweet', 'communication']
    fields = iot.read_fields()
    # print len(fields)
    for gf in gs[1:]:
        g = gt.Graph.Read_GraphML('data/' + gf + '_net.graphml')
        # g = gt.giant_component(g)
        # gt.net_stat(g)
        sigs = []
        for filed in fields:
            g = gt.add_attribute(g, 'foi', 'depression', 'com', filed)
            raw_values = np.array(g.vs['foi'])
            values = drop_initials(raw_values)
            if len(values) > 100:
                output = gf + ',' + filed.split('.')[-1] + ','
                # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
                maxv, minv = max(values), min(values)
                vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                sg = g.subgraph(vs)
                raw_assort = sg.assortativity('foi', 'foi', directed=True)
                ass_list = []
                for i in xrange(1000):
                    np.random.shuffle(raw_values)
                    g.vs["foi"] = raw_values
                    vs = g.vs.select(foi_ge=minv, foi_le=maxv)
                    sg = g.subgraph(vs)
                    ass_list.append(
                        sg.assortativity('foi', 'foi', directed=True))

                ass_list = np.array(ass_list)
                amean, astd = np.mean(ass_list), np.std(ass_list)
                absobserved = abs(raw_assort)
                pval = (np.sum(ass_list >= absobserved) +
                        np.sum(ass_list <= -absobserved)) / float(
                            len(ass_list))
                zscore = (raw_assort - amean) / astd
                output += format(raw_assort, '.2f') + ',' + format(amean, '.2f') + ',' + \
                          format(astd, '.2f') + ',' + format(zscore, '.2f') + ',' + format(pval, '.3f') + ','
                if pval < 0.001:
                    output += '***'
                    if raw_assort > 0:
                        sigs.append('***')
                    print output
                    continue
                if pval < 0.01:
                    output += '**'
                    if raw_assort > 0:
                        sigs.append('**')
                    print output
                    continue
                if pval < 0.05:
                    output += '*'
                    if raw_assort > 0:
                        sigs.append('*')
                    print output
                    continue
                else:
                    sigs.append('N')
                    print output
                    continue
        c = Counter(sigs)
        print c
        for sig, cou in c.items():
            print sig, 1.0 * cou / len(fields)