def coreness_features(g):
    """Plot the correlation between vertex k-core indices and feature values.

    Collapses the graph to undirected, computes each vertex's shell
    (k-core) index, then for every feature field writes a correlation plot
    of core number vs. feature value to data/corerel/<field>.pdf.

    :param g: igraph Graph; parallel edges are collapsed when made undirected
    """
    g = g.as_undirected(mode="collapse")
    # shell_index == k-core number of each vertex
    g.vs['core'] = g.shell_index(mode='ALL')
    fields = iot.read_fields()
    for field in fields:
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # Select once instead of twice; -1000000000.0 marks a missing value.
        valid = g.vs.select(pof_ne=-1000000000.0)
        pt.correlation(valid['core'], valid['pof'],
                       'K-Core', 'Feature',
                       'data/corerel/' + field + '.pdf')
def coreness_features(g):
    """Correlation of k-core index and feature values (plot per field)."""
    g = g.as_undirected(mode="collapse")
    all_coreness = g.shell_index(mode='ALL')
    g.vs['core'] = all_coreness
    for field in iot.read_fields():
        gt.add_attribute(g, 'pof', 'fed', 'com', field)
        # -1000000000.0 flags vertices without a feature value.
        cores = g.vs.select(pof_ne=-1000000000.0)['core']
        feats = g.vs.select(pof_ne=-1000000000.0)['pof']
        out_path = 'data/corerel/' + field + '.pdf'
        pt.correlation(cores, feats, 'K-Core', 'Feature', out_path)
def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    """Plot indegree vs. feature value on the giant component.

    For each field, attaches the feature as vertex attribute 'foi', trims
    both the feature and the indegree distributions to their central 95%
    (2.5th-97.5th percentile) to drop outliers, and writes a correlation
    plot to data/<field>.pdf.

    :param gc: igraph Graph; its weak giant component is analysed
    :param dbname: database name the feature is read from
    :param comname: collection name the feature is read from
    :param db_field_names: iterable of feature field names
    :param directed: kept for interface compatibility; not used here
    """
    g = gt.giant_component(gc, 'WEAK')
    g.vs['nt'] = g.degree(type="in")
    netatt = g.vs['nt']
    for db_field_name in db_field_names:
        g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name)
        values = drop_initials(np.array(g.vs['foi']))
        # Require enough observations for a meaningful correlation plot.
        if len(values) > 100:
            maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5)
            sg = g.subgraph(g.vs(foi_ge=minv, foi_le=maxv))
            maxd, mind = np.percentile(netatt, 97.5), np.percentile(netatt, 2.5)
            sg = sg.subgraph(sg.vs(nt_ge=mind, nt_le=maxd))
            pt.correlation(sg.vs['nt'], sg.vs['foi'], 'Indegree', 'Feature',
                           'data/' + db_field_name + '.pdf')
def rank_feature(gc, dbname, comname, db_field_names, directed=True):
    """Correlate vertex indegree with each feature and plot the result."""
    g = gt.giant_component(gc, "WEAK")
    g.vs["nt"] = g.degree(type="in")
    netatt = g.vs["nt"]
    for field in db_field_names:
        g = gt.add_attribute(g, "foi", dbname, comname, field)
        raw_values = np.array(g.vs["foi"])
        values = drop_initials(raw_values)
        if len(values) <= 100:
            continue
        # Keep the central 95% of feature values, then of indegrees.
        minv = np.percentile(values, 2.5)
        maxv = np.percentile(values, 97.5)
        sg = g.subgraph(g.vs(foi_ge=minv, foi_le=maxv))
        mind = np.percentile(netatt, 2.5)
        maxd = np.percentile(netatt, 97.5)
        sg = sg.subgraph(sg.vs(nt_ge=mind, nt_le=maxd))
        pt.correlation(sg.vs["nt"], sg.vs["foi"], "Indegree", "Feature",
                       "data/" + field + ".pdf")
def feature_assort_friend(g, dbname, comname, db_field_names, directed=True): """Using iGraph Assigning values different from zero or one to the adjacency matrix will be translated to one, unless the graph is weighted, in which case the numbers will be treated as weights """ node_size, edge_size = g.vcount(), g.ecount() outputs = {} print ("Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_sore, p_value") for db_field_name in db_field_names: # print 'Processing ' + db_field_name g = gt.add_attribute(g, "foi", dbname, comname, db_field_name) raw_values = np.array(g.vs["foi"]) values = drop_initials(raw_values) if len(values) > 100: output = "" # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5) maxv, minv = max(values), min(values) vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) t_node_size, t_edge_size = len(sg.vs), len(sg.es) output += ( db_field_name + "," + str(t_node_size) + "," + str(t_edge_size) + "," + str(float(t_node_size) / node_size) + "," + str(float(t_edge_size) / edge_size) + "," + str(sg.assortativity_degree(directed=directed)) + "," + str(sg.assortativity("foi", "foi", directed=directed)) + "," ) raw_assort = sg.assortativity("foi", "foi", directed=directed) ass_list = list() for i in xrange(3000): np.random.shuffle(raw_values) g.vs["foi"] = raw_values vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) ass_list.append(sg.assortativity("foi", "foi", directed=directed)) ass_list = np.array(ass_list) amean, astd = np.mean(ass_list), np.std(ass_list) absobserved = abs(raw_assort) pval = (np.sum(ass_list >= absobserved) + np.sum(ass_list <= -absobserved)) / float(len(ass_list)) zscore = (raw_assort - amean) / astd # print pval output += str(raw_assort) + "," + str(amean) + "," + str(astd) + "," + str(zscore) + "," + str(pval) print output if pval < 0.001: output += "***" outputs[output] = abs(zscore) continue if pval < 0.01: output += "**" outputs[output] = abs(zscore) 
continue if pval < 0.05: output += "*" outputs[output] = abs(zscore) continue else: outputs[output] = abs(zscore) continue return outputs
def feature_assort_friend(g, dbname, comname, db_field_names, directed=True): '''Using iGraph Assigning values different from zero or one to the adjacency matrix will be translated to one, unless the graph is weighted, in which case the numbers will be treated as weights ''' node_size, edge_size = g.vcount(), g.ecount() outputs = {} print( 'Feature, #Nodes, #Edges, %Nodes, %Edges, D_assort, F_assort, F_assort, Mean, STD, z_sore, p_value' ) for db_field_name in db_field_names: # print 'Processing ' + db_field_name g = gt.add_attribute(g, 'foi', dbname, comname, db_field_name) raw_values = np.array(g.vs['foi']) values = drop_initials(raw_values) if len(values) > 100: output = '' # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5) maxv, minv = max(values), min(values) vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) t_node_size, t_edge_size = len(sg.vs), len(sg.es) output += db_field_name + ',' + str(t_node_size) + ',' + str(t_edge_size) + ',' \ + str(float(t_node_size)/node_size) + ',' + str(float(t_edge_size)/edge_size)+ ',' \ + str(sg.assortativity_degree(directed=directed)) + ',' \ + str(sg.assortativity('foi', 'foi', directed=directed)) + ',' raw_assort = sg.assortativity('foi', 'foi', directed=directed) ass_list = list() for i in xrange(3000): np.random.shuffle(raw_values) g.vs["foi"] = raw_values vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) ass_list.append( sg.assortativity('foi', 'foi', directed=directed)) ass_list = np.array(ass_list) amean, astd = np.mean(ass_list), np.std(ass_list) absobserved = abs(raw_assort) pval = (np.sum(ass_list >= absobserved) + np.sum(ass_list <= -absobserved)) / float(len(ass_list)) zscore = (raw_assort - amean) / astd # print pval output += str(raw_assort) + ',' + str(amean) + ',' + str( astd) + ',' + str(zscore) + ',' + str(pval) print output if pval < 0.001: output += '***' outputs[output] = abs(zscore) continue if pval < 0.01: output += '**' outputs[output] = abs(zscore) 
continue if pval < 0.05: output += '*' outputs[output] = abs(zscore) continue else: outputs[output] = abs(zscore) continue return outputs
def network_assort(): # test network assortative gs = ['edfollow', 'follow', 'retweet', 'communication'] fields = iot.read_fields() # print len(fields) for gf in gs[1:]: g = gt.Graph.Read_GraphML('data/' + gf + '_net.graphml') # g = gt.giant_component(g) # gt.net_stat(g) sigs = [] for filed in fields: g = gt.add_attribute(g, 'foi', 'depression', 'com', filed) raw_values = np.array(g.vs['foi']) values = drop_initials(raw_values) if len(values) > 100: output = gf + ',' + filed.split('.')[-1] + ',' # maxv, minv = np.percentile(values, 97.5), np.percentile(values, 2.5) maxv, minv = max(values), min(values) vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) raw_assort = sg.assortativity('foi', 'foi', directed=True) ass_list = [] for i in xrange(1000): np.random.shuffle(raw_values) g.vs["foi"] = raw_values vs = g.vs.select(foi_ge=minv, foi_le=maxv) sg = g.subgraph(vs) ass_list.append( sg.assortativity('foi', 'foi', directed=True)) ass_list = np.array(ass_list) amean, astd = np.mean(ass_list), np.std(ass_list) absobserved = abs(raw_assort) pval = (np.sum(ass_list >= absobserved) + np.sum(ass_list <= -absobserved)) / float( len(ass_list)) zscore = (raw_assort - amean) / astd output += format(raw_assort, '.2f') + ',' + format(amean, '.2f') + ',' + \ format(astd, '.2f') + ',' + format(zscore, '.2f') + ',' + format(pval, '.3f') + ',' if pval < 0.001: output += '***' if raw_assort > 0: sigs.append('***') print output continue if pval < 0.01: output += '**' if raw_assort > 0: sigs.append('**') print output continue if pval < 0.05: output += '*' if raw_assort > 0: sigs.append('*') print output continue else: sigs.append('N') print output continue c = Counter(sigs) print c for sig, cou in c.items(): print sig, 1.0 * cou / len(fields)