Ejemplo n.º 1
0
def smallworldness(network, rep=1000):
    """Measure small-world-ness of ``network`` against random graphs.

    A copy of ``network.g`` is reduced to its largest component and compared,
    through ``smallworldness_measure``, with ``rep`` Erdos-Renyi graphs that
    have the same number of vertices and an edge probability equal to the
    component's density.

    Returns:
        A ``(mean_s, ss)`` tuple: the mean of the per-graph measures and the
        list holding all ``rep`` individual measures.
    """
    # The average path length means nothing on a disconnected graph, so only
    # the largest component is analysed.
    component = ResearchCollaborationNetwork.largest_component(
        network.g.copy())

    n = len(component.vs)
    m = len(component.es)

    # Fraction of the possible undirected edges that are present.
    p = float(m) * 2 / (n * (n - 1))

    # Sharp threshold for the connectedness of an ER graph, see
    # http://en.wikipedia.org/wiki/Erd%C5%91s%E2%80%93R%C3%A9nyi_model
    c = float((np.exp(1) + 1)) * np.log(n) / n

    summary = "Small-world-ness measure: %d iterations; Erdos_Renyi: p = %f (%d/%d), n  = %d, np = %f, (1 + e) * (ln n / n) = %f"
    logger.info(summary % (rep, p, m, (n * (n - 1)) / 2, n, n * p, c))

    # One measure per freshly generated random graph.
    ss = [
        smallworldness_measure(
            component,
            igraph.Graph.Erdos_Renyi(n, p, directed=False, loops=False))
        for _ in range(rep)
    ]

    return np.mean(ss), ss
Ejemplo n.º 2
0
def centrality_leaders(budgetYears):
    """Tag, draw and log the centrality leaders of one network.

    Loads the network for ``budgetYears``, keeps the largest component of a
    copy of its graph and asks ``cl.centrality_leaders`` for candidates and
    tiered rankings. For the first ``topK`` tiers every listed vertex gets a
    ``centrality_leader`` attribute and its name is collected. The graph is
    then drawn to a PNG in the ``figures`` folder and the names are logged.
    """
    topK = 10

    g = ResearchCollaborationNetwork.largest_component(
        load_network_for(budgetYears).g.copy())

    candidates, rankings = cl.centrality_leaders(g)

    ordered_list = []
    for tier in range(min(len(rankings), topK)):
        for i in list(rankings[tier]):
            vertex = g.vs[candidates[i]]
            ordered_list.append(vertex['name'])
            # Earlier tiers receive larger values: the higher the better.
            vertex['centrality_leader'] = topK + 1 - tier

    first_year = budgetYears[0]
    last_year = budgetYears[-1]

    target = '%s/figures/%s-%s-centrality-leaders.png' % (root_folder(), first_year, last_year)
    draw(g, target)

    logger.info(ordered_list)
Ejemplo n.º 3
0
def smallworldness(network, rep = 1000):
    """Compare the network's largest component with random graphs.

    The largest component of a copy of ``network.g`` is scored with
    ``smallworldness_measure`` against ``rep`` Erdos-Renyi graphs that share
    its vertex count and whose edge probability equals its density.

    Returns:
        ``(mean_s, ss)``: the mean of the scores and the list of all scores.
    """
    # A disconnected graph is pointless here (its average path length means
    # nothing), hence the restriction to the largest component.
    largest = ResearchCollaborationNetwork.largest_component(network.g.copy())

    n, m = len(largest.vs), len(largest.es)

    # Share of all possible undirected edges that actually exist.
    p = float(m) * 2 /(n*(n-1))

    # Sharp threshold for the connectedness of an ER graph, see
    # http://en.wikipedia.org/wiki/Erd%C5%91s%E2%80%93R%C3%A9nyi_model
    c = float((np.exp(1) + 1)) * np.log(n) / n

    details = (rep, p, m, (n*(n-1))/2, n, n * p,  c)
    logger.info("Small-world-ness measure: %d iterations; Erdos_Renyi: p = %f (%d/%d), n  = %d, np = %f, (1 + e) * (ln n / n) = %f"%details)

    ss = []
    remaining = rep
    while remaining > 0:
        random_graph = igraph.Graph.Erdos_Renyi(n, p, directed = False, loops = False)
        ss.append(smallworldness_measure(largest, random_graph))
        remaining -= 1

    return np.mean(ss), ss
Ejemplo n.º 4
0
def centrality_leaders(budgetYears):
    """Mark the centrality leaders of a network, draw it and log their names.

    Works on the largest component of a copy of the graph loaded for
    ``budgetYears``. ``cl.centrality_leaders`` supplies candidates and tiered
    rankings; vertices in the first ``topK`` tiers get a
    ``centrality_leader`` attribute. The tagged graph is drawn to
    ``<root>/figures/<first>-<last>-centrality-leaders.png``.
    """
    topK = 10

    loaded = load_network_for(budgetYears)
    g = ResearchCollaborationNetwork.largest_component(loaded.g.copy())

    candidates, rankings = cl.centrality_leaders(g)

    ordered_list = []
    tier_count = min(len(rankings), topK)
    for tier in range(tier_count):
        # The node's centrality_leader attribute: the higher the better.
        score = topK + 1 - tier
        for i in list(rankings[tier]):
            leader = g.vs[candidates[i]]
            ordered_list.append(leader['name'])
            leader['centrality_leader'] = score

    year_from = budgetYears[0]
    year_to = budgetYears[-1]

    image_path = '%s/figures/%s-%s-centrality-leaders.png' % (
        root_folder(), year_from, year_to)
    draw(g, image_path)

    logger.info(ordered_list)
Ejemplo n.º 5
0
def update_graphml(budgetYears):
    """Store centrality-leader scores in the network's GraphML file.

    Every vertex of the loaded network first gets ``centrality_leader = 0``.
    Rankings are then computed by ``cl.centrality_leaders`` on the largest
    component of a copy of the graph, and the vertices in the first ``topK``
    tiers are overwritten with ``topK - tier``. The network is finally
    written back to ``<root>/data/networks/<first>-<last>.graphml``.
    """
    first_year, last_year = budgetYears[0], budgetYears[-1]

    network = load_network_for(budgetYears)

    # Default value for all vertices; only ranked ones are changed below.
    network.g.vs['centrality_leader'] = 0

    largest = ResearchCollaborationNetwork.largest_component(network.g.copy())

    topK = 50

    candidates, rankings = cl.centrality_leaders(largest)

    for tier in range(min(len(rankings), topK)):
        for i in list(rankings[tier]):
            node_name = largest.vs[candidates[i]]['name']
            # Rankings refer to the component copy, so the vertex is looked
            # up by name in the full graph before being tagged. The higher
            # the value, the better.
            matches = network.g.vs.select(name_eq=node_name)
            matches['centrality_leader'] = topK - tier

    target = '%s/data/networks/%d-%d.graphml' % (root_folder(),
                                                 first_year, last_year)

    network.write(target)
Ejemplo n.º 6
0
def load_network_for(budgetYears):
    """Read the stored network covering ``budgetYears``.

    The file is ``<root_folder()>/data/networks/<first>-<last>.graphml``,
    where ``first`` and ``last`` are the first and last entries of
    ``budgetYears``. Returns the result of
    ``ResearchCollaborationNetwork.read`` for that file.
    """
    first_year, last_year = budgetYears[0], budgetYears[-1]
    path = '%s/data/networks/%d-%d.graphml'%(root_folder(), first_year, last_year)
    return ResearchCollaborationNetwork.read(path)
Ejemplo n.º 7
0
def load_network_for(budgetYears):
    """Load the GraphML network for a span of budget years.

    Only the first and last entries of ``budgetYears`` are used; together
    with ``root_folder()`` they form the path
    ``<root>/data/networks/<first>-<last>.graphml``, which is handed to
    ``ResearchCollaborationNetwork.read``.
    """
    span = (budgetYears[0], budgetYears[-1])
    graphml_path = '%s/data/networks/%d-%d.graphml' % ((root_folder(),) + span)
    return ResearchCollaborationNetwork.read(graphml_path)
Ejemplo n.º 8
0
def rwr_scores(budgetYears):
    """Score every vertex pair with ``pgrank.rwr_score`` and save the result.

    The network for ``budgetYears`` is loaded and simplified, every
    unordered pair of distinct vertices is scored, and the pairs whose score
    exceeds 0.001 are written as JSON to
    ``<root>/data/networks/<first>-<last>-rwr.json``. The JSON object maps
    ``"<name1>,<name2>"`` (vertex ``name`` attributes) to the score.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    logger.info('---------------- %s-%s -------------------' %
                (startBudgetYear, endBudgetYear))

    network = load_network_for(budgetYears)

    # Keep the graph returned by simplify(), as every other analysis in this
    # module does, so the scores are computed on the simplified graph even if
    # simplify() does not work in place.
    g = ResearchCollaborationNetwork.simplify(network.g.copy())

    logger.info(g.summary())

    # All unordered pairs of distinct vertex indices, keyed "i,j" with i < j.
    vertex_count = len(g.vs)
    links = ['%d,%d' % (i, j)
             for i in range(vertex_count)
             for j in range(i + 1, vertex_count)]

    # Named ``scores`` so the local does not shadow this function.
    scores = pgrank.rwr_score(g, links)

    rwrs = {}
    for link, score in scores.items():
        parts = link.split(',')
        v1 = int(parts[0])
        v2 = int(parts[1])

        # Only keep pairs with a non-negligible score to limit file size.
        if float(score) > 0.001:
            key = '%s,%s' % (g.vs[v1]['name'], g.vs[v2]['name'])
            rwrs[key] = score

    filename = '%s/data/networks/%d-%d-rwr.json' % (root_folder(),
                                                    startBudgetYear, endBudgetYear)

    with open(filename, 'w') as out:
        json.dump(rwrs, out)
Ejemplo n.º 9
0
def rwr_scores(budgetYears):
    """Compute ``pgrank.rwr_score`` for all vertex pairs and dump them to JSON.

    Steps: load the network for ``budgetYears``, simplify a copy of its
    graph, score every unordered pair of distinct vertices, keep the pairs
    scoring above 0.001 and write them to
    ``<root>/data/networks/<first>-<last>-rwr.json`` as a mapping from
    ``"<name1>,<name2>"`` to score.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    logger.info('---------------- %s-%s -------------------' %
                (startBudgetYear, endBudgetYear))

    network = load_network_for(budgetYears)

    # Use simplify()'s return value, consistent with the other callers in
    # this module; discarding it would score the unsimplified graph whenever
    # simplify() returns a new graph instead of mutating its argument.
    g = ResearchCollaborationNetwork.simplify(network.g.copy())

    logger.info(g.summary())

    # Candidate links: "i,j" for every vertex index pair with i < j.
    links = []
    m = len(g.vs)
    for i in range(m):
        for j in range(i + 1, m):
            links.append('%d,%d' % (i, j))

    # ``pair_scores`` rather than ``rwr_scores`` to avoid shadowing the
    # enclosing function's own name.
    pair_scores = pgrank.rwr_score(g, links)

    rwrs = {}
    for link, score in pair_scores.items():
        # Negligible scores are dropped from the output.
        if float(score) <= 0.001:
            continue

        v = link.split(',')
        v1 = int(v[0])
        v2 = int(v[1])
        rwrs['%s,%s' % (g.vs[v1]['name'], g.vs[v2]['name'])] = score

    filename = '%s/data/networks/%d-%d-rwr.json' % (
        root_folder(), startBudgetYear, endBudgetYear)

    with open(filename, 'w') as out:
        json.dump(rwrs, out)
Ejemplo n.º 10
0
def get_data(budgetYears):
    """Return the simplified graph with its degree and strength arrays.

    Loads the network for ``budgetYears`` and simplifies a copy of its graph.

    Returns:
        ``(wg, degree, strength)`` where ``degree`` and ``strength`` are
        integer NumPy arrays built from ``wg.degree()`` and from
        ``wg.strength`` weighted by the ``weight`` edge attribute (loops
        excluded).
    """
    network = load_network_for(budgetYears)

    wg = ResearchCollaborationNetwork.simplify(network.g.copy())

    # ``np.int`` was merely an alias of the builtin ``int``; it was deprecated
    # in NumPy 1.20 and later removed, so the builtin is used directly.
    degree = np.array(wg.degree(), dtype=int)

    strength = np.array(
        wg.strength(loops=False, weights=wg.es['weight']), dtype=int)

    return wg, degree, strength
Ejemplo n.º 11
0
def average_strength_for(budgetYears):
    """Log the average strength per category for one network.

    The largest component of the network for ``budgetYears`` is categorised
    with ``set_category_by_is_ctsa`` against the module-level ``refG``, then
    ``average_strength`` is logged for category ``1.0`` (labelled "ctsa")
    and category ``0.0`` (labelled "non-ctsa").
    """
    logger.info(budgetYears)

    # Only the largest component of the network is analysed.
    g = ResearchCollaborationNetwork.largest_component(
        load_network_for(budgetYears).g.copy())

    g = set_category_by_is_ctsa(g, refG)

    reports = (
        ('ctsa: %0.2f', 1.0),
        ('non-ctsa: %0.2f', 0.0),
    )
    for template, category in reports:
        logger.info(template%average_strength(g, category))
Ejemplo n.º 12
0
def draw_g(budgetYears):
    """Draw the simplified network and its largest component as PNG files.

    Two images are written under ``<root>/figures``; both file names carry
    the first and last budget year and the number of vertices drawn, and the
    second one ends in ``-largest-component``.
    """
    simplified = ResearchCollaborationNetwork.simplify(
        load_network_for(budgetYears).g.copy())

    first_year = budgetYears[0]
    last_year = budgetYears[-1]

    whole_png = '%s/figures/%s-%s-%d.png'%(root_folder(),first_year, last_year,len(simplified.vs))
    draw(simplified, whole_png)

    largest = ResearchCollaborationNetwork.largest_component(simplified)

    largest_png = '%s/figures/%s-%s-%d-largest-component.png'%(root_folder(),first_year, last_year,len(largest.vs))
    draw(largest, largest_png)
Ejemplo n.º 13
0
def average_shortest_path_for(budgetYears):
    """Log average shortest path lengths between investigator categories.

    The largest component of the network for ``budgetYears`` is categorised
    with ``set_category_by_is_ctsa`` against the module-level ``refG``.
    ``average_shortest_path`` is then logged for several source/target
    category combinations, using the reciprocal of each edge's ``weight`` as
    its length.
    """
    logger.info(budgetYears)

    network = load_network_for(budgetYears)
    g = network.g.copy()

    # Only the largest connected component of the network is analysed.
    g = ResearchCollaborationNetwork.largest_component(g)

    g = set_category_by_is_ctsa(g, refG)

    # Force true division: with integer weights ``1/weight`` truncates to 0
    # under Python 2. This matches ``network_characteristics``.
    weights = [1 / float(weight) for weight in g.es['weight']]

    logger.info('within non-CTSA investigators: %0.3f'%average_shortest_path(g, weights = weights, source = 0.0, target = 0.0))
    logger.info('within CTSA investigators: %0.3f'%average_shortest_path(g, weights = weights, source=1.0, target = 1.0))
    logger.info('from non-CTSA to all: %0.3f'%average_shortest_path(g, weights = weights, source = 0.0, target = None))
    logger.info('from CTSA to all: %0.3f'%average_shortest_path(g, weights = weights, source = 1.0, target = None))
Ejemplo n.º 14
0
def network_to_d3(budgetYears):
    """Export the network for ``budgetYears`` as three D3 JSON files.

    Written under ``<root>/data/networks`` via
    ``ResearchCollaborationNetwork.d3``:

    * ``<first>-<last>-complete.json`` -- the graph exactly as loaded;
    * ``<first>-<last>.json`` -- a simplified copy;
    * ``<first>-<last>-largest-component.json`` -- the largest component of
      a copy.
    """
    network = load_network_for(budgetYears)
    first_year = budgetYears[0]
    last_year = budgetYears[-1]

    # (file name template, factory for the graph to export), in output order.
    exports = (
        ('%s/data/networks/%s-%s-complete.json',
         lambda: network.g),
        # simplified copy (per the original note: removes isolated nodes)
        ('%s/data/networks/%s-%s.json',
         lambda: ResearchCollaborationNetwork.simplify(network.g.copy())),
        # only the largest component
        ('%s/data/networks/%s-%s-largest-component.json',
         lambda: ResearchCollaborationNetwork.largest_component(
             network.g.copy())),
    )

    for template, build_graph in exports:
        graph = build_graph()
        filename = template % (root_folder(), first_year, last_year)
        ResearchCollaborationNetwork.d3(graph, filename)
Ejemplo n.º 15
0
def network_to_d3(budgetYears):
    """Write three D3 JSON views of the network for ``budgetYears``.

    The loaded graph, a simplified copy and the largest component of a copy
    are each passed to ``ResearchCollaborationNetwork.d3`` with a file name
    under ``<root>/data/networks`` built from the first and last budget year.
    """
    network = load_network_for(budgetYears)
    year_from = budgetYears[0]
    year_to = budgetYears[-1]

    def export(graph, template):
        # Fill in the folder and year span, then hand over to the exporter.
        ResearchCollaborationNetwork.d3(
            graph, template % (root_folder(), year_from, year_to))

    # the complete graph, as loaded
    export(network.g, '%s/data/networks/%s-%s-complete.json')

    # simplified copy (per the original note: removes isolated nodes)
    export(ResearchCollaborationNetwork.simplify(network.g.copy()),
           '%s/data/networks/%s-%s.json')

    # only the largest component
    export(ResearchCollaborationNetwork.largest_component(network.g.copy()),
           '%s/data/networks/%s-%s-largest-component.json')
Ejemplo n.º 16
0
def update_graphml(budgetYears):
    """Write centrality-leader tiers into the network's GraphML file.

    All vertices start with ``centrality_leader = 0``. Rankings come from
    ``cl.centrality_leaders`` run on the largest component of a copy of the
    graph; vertices listed in the first ``topK`` tiers are set to their
    1-based tier number. Each processed tier index is logged, and the
    network is written to ``<root>/data/networks/<first>-<last>.graphml``.
    """
    year_from, year_to = budgetYears[0], budgetYears[-1]

    network = load_network_for(budgetYears)

    # Baseline for every vertex; ranked vertices are overwritten below.
    network.g.vs['centrality_leader'] = 0

    component = ResearchCollaborationNetwork.largest_component(
        network.g.copy())

    topK = 50

    candidates, rankings = cl.centrality_leaders(component)

    for tier in range(min(len(rankings), topK)):

        logger.info('tier: %d' % tier)

        for i in list(rankings[tier]):
            node_name = component.vs[candidates[i]]['name']
            # The ranking indexes the component copy; resolve the vertex by
            # name in the full graph before storing its tier.
            ranked = network.g.vs.select(name_eq=node_name)
            ranked['centrality_leader'] = tier + 1

    graphml_path = '%s/data/networks/%d-%d.graphml' % (
        root_folder(), year_from, year_to)

    network.write(graphml_path)
Ejemplo n.º 17
0
def per_network(budgetYears):
    """Cross-validate ``pgrank.rwr_score`` link prediction on one network.

    Twenty vertices are sampled at random from the simplified network for
    ``budgetYears``. Every pair ``(i, j)`` with ``i`` sampled and ``j > i``
    is labelled from the adjacency matrix as an existing or non-existing
    edge. Both groups are split with ``utils.k_fold_cross_validation``; in
    each fold the validation edges are removed from a copy of the graph and
    the validation pairs are scored on it. AUC and average precision at
    k = 3, 5, 10 (from ``benchmarks``) are summed over the folds, divided by
    the fold count and logged. All ``(pair, label, score)`` samples are
    saved to ``<root>/data/<first>-<last>.per_network.roc.samples.npy``.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    logger.info('---------------- %s-%s -------------------'%(startBudgetYear, endBudgetYear))

    network = load_network_for(budgetYears)

    # Keep simplify()'s return value, as the other analyses in this module do.
    g = ResearchCollaborationNetwork.simplify(network.g.copy())

    logger.info(g.summary())

    # Randomly pick 20 users. shuffle() needs a real list (range() is not
    # mutable on Python 3); a set afterwards gives O(1) membership.
    candidates = list(range(len(g.vs)))
    shuffle(candidates)
    candidates = set(candidates[:20])

    adj = np.array(g.get_adjacency(igraph.GET_ADJACENCY_BOTH).data)

    m, _ = adj.shape

    nonobservedlinks = {}
    nonobserved_actual_edges = []
    nonobserved_nonexist_edges = []

    # Undirected graph, so only pairs whose lower-indexed endpoint was
    # sampled are considered; ascending order keeps the key order stable.
    for i in sorted(candidates):
        for j in range(i + 1, m):
            key = '%d,%d'%(i,j)
            nonobservedlinks[key] = adj[i,j]

            if adj[i,j] > 0:
                nonobserved_actual_edges.append(key)
            else:
                nonobserved_nonexist_edges.append(key)

    auc = 0.0
    apk = {3: 0.0, 5: 0.0, 10: 0.0}
    kfold = 10
    roc_samples = []

    folds = zip(
        utils.k_fold_cross_validation(list(nonobserved_actual_edges), kfold),
        utils.k_fold_cross_validation(list(nonobserved_nonexist_edges), kfold))

    for cnt, ((es_p_training, es_p_validation), (es_m_training, es_m_validation)) in enumerate(folds):

        logger.info('--------iteration %d-------------'%cnt)

        logger.info('xxxxxxxxxxxxxxxxxxxxxxxx')
        logger.info('positive training: %d'%len(es_p_training))
        logger.info('positive validation: %d'%len(es_p_validation))
        logger.info('------------------------')
        logger.info('negative training: %d'%len(es_m_training))
        logger.info('negative validation: %d'%len(es_m_validation))

        validation = es_p_validation + es_m_validation

        # Training graph: the full graph minus the edges held out for
        # validation.
        trainingG = g.copy()

        edges_2_delete = []
        for link in validation:
            v = link.split(',')
            v1 = int(v[0])
            v2 = int(v[1])
            # error=False makes get_eid return -1 for a missing edge.
            eId = trainingG.get_eid(v1,v2, directed=False, error=False)
            if eId != -1:
                edges_2_delete.append(eId)

        trainingG.delete_edges(edges_2_delete)

        rwr_scores = pgrank.rwr_score(trainingG, validation)

        actual = [nonobservedlinks[k] for k in validation]
        posterior = [rwr_scores[k] for k in validation]
        actual_edges = [k for k in validation if nonobservedlinks[k] > 0]
        roc_samples.extend(
            (k, nonobservedlinks[k], rwr_scores[k]) for k in validation)

        auc += benchmarks.auc(actual, posterior)

        # Validation indexes by descending score. The sort is stable, so each
        # top-k prediction list is simply a prefix of this ranking.
        ranked = sorted(range(len(posterior)), reverse=True, key=lambda idx: posterior[idx])
        validation_array = np.array(validation)

        for topK in sorted(apk):
            predicted = validation_array[ranked[:topK]]
            apk[topK] += benchmarks.apk(actual_edges, predicted, topK)

    # take a look at http://www.machinedlearnings.com/2012/06/thought-on-link-prediction.html
    logger.info('auc: %f'%(auc/kfold))
    for topK in sorted(apk):
        logger.info('ap@%d: %f'%(topK, (apk[topK]/kfold)))

    np.save('%s/data/%s-%s.per_network.roc.samples.npy'%(root_folder(),startBudgetYear, endBudgetYear), np.array(roc_samples))
Ejemplo n.º 18
0
def network_characteristics(budgetYears):
    """Log structural statistics of the network for ``budgetYears``.

    On the simplified graph: node count, edge count, density, the value of
    ``average_number_of_new_edges`` relative to an earlier network (0.0 when
    the span starts in the 2006 baseline or earlier) and the value of
    ``num_of_isolated_components``. On its largest component: node and edge
    counts, three clustering coefficients, two average shortest path lengths
    and ``diversity``.
    """
    logger.info(
        "================================================================")
    logger.info(budgetYears)

    # The original notes that the simplified network has no isolated nodes.
    g = ResearchCollaborationNetwork.simplify(
        load_network_for(budgetYears).g.copy())

    logger.info('# of nodes: %d' % (len(g.vs)))
    logger.info('# of edges: %d' % (len(g.es)))
    logger.info('density: %.3f' % (g.density()))

    # 2006 is the baseline: only later spans have an earlier network to
    # compare against.
    new_edges = 0.0
    if budgetYears[0] > 2006:
        covers_2010_to_2012 = (budgetYears[0] == 2010
                               and budgetYears[-1] == 2012)
        if covers_2010_to_2012:
            pBudgetYears = range(2006, 2010)
        else:
            # Same span shifted back by one year.
            pBudgetYears = np.array(budgetYears) - 1

        previous = ResearchCollaborationNetwork.simplify(
            load_network_for(pBudgetYears).g.copy())

        new_edges = average_number_of_new_edges(g, previous)
    logger.info('average number of new edges: %.3f' % new_edges)

    logger.info('# of isolated components: %d' %
                (num_of_isolated_components(g)))

    # Only the largest component from here on, because shortest path length
    # is rather arbitrary on graphs with isolated components.
    g = ResearchCollaborationNetwork.largest_component(g)
    edge_weights = g.es['weight']
    inverse_weights = [1 / float(w) for w in edge_weights]
    unit_weights = [1] * len(edge_weights)

    logger.info('# of nodes (largest component): %d' % (len(g.vs)))
    logger.info('# of edges (largest component): %d' % (len(g.es)))

    logger.info('C_g (weights = None): %.3f' %
                g.transitivity_avglocal_undirected(mode='zero',
                                                   weights=unit_weights))

    logger.info('C_g (weights = number of collaborations): %.3f' %
                g.transitivity_avglocal_undirected(mode='zero',
                                                   weights=edge_weights))

    logger.info('C_g (triplets definition): %.3f' %
                g.transitivity_undirected(mode='zero'))

    logger.info("L_g (weights = 1): %.3f" %
                average_shortest_path_length_weighted(g, unit_weights))

    logger.info("L_g (weights = 1/weights): %.3f" %
                average_shortest_path_length_weighted(g, inverse_weights))

    logger.info("D_g (weights = 1/weights): %.3f" %
                diversity(g, inverse_weights))
Ejemplo n.º 19
0
def per_candidate(budgetYears):
    """Cross-validate ``pgrank.rwr_score`` link prediction per candidate.

    Up to ten vertices with degree above 15 are sampled from the simplified
    network for ``budgetYears``. For each candidate ``c`` every pair
    ``(c, j)`` is labelled from the adjacency matrix as existing or
    non-existing, both groups are split with
    ``utils.k_fold_cross_validation``, the validation edges are removed from
    a copy of the graph and the validation pairs are scored on it. AUC and
    average precision at k = 3, 5, 10 (from ``benchmarks``) are logged per
    candidate and overall, each divided by the number of folds they were
    summed over. All ``(pair, label, score)`` samples are saved to
    ``<root>/data/<first>-<last>.per_user.roc.samples.npy``.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    logger.info('---------------- %s-%s -------------------'%(startBudgetYear, endBudgetYear))

    network = load_network_for(budgetYears)

    # Keep simplify()'s return value, as the other analyses in this module do.
    g = ResearchCollaborationNetwork.simplify(network.g.copy())

    logger.info(g.summary())

    adj = np.array(g.get_adjacency(igraph.GET_ADJACENCY_BOTH).data)

    m, _ = adj.shape

    candidates = [cNode.index for cNode in g.vs.select(_degree_gt=15)]

    shuffle(candidates)
    candidates = candidates[:10]

    if not candidates:
        # Nothing to evaluate; the averages below would divide by zero.
        logger.info('no candidate nodes with degree > 15; nothing to evaluate')
        return

    cutoffs = (3, 5, 10)
    kfold = 5

    total_auc = 0.0
    # Accumulated over all candidates. This must be its own dict: sharing one
    # dict with the per-candidate ``apk`` double-counts every value.
    mapk = dict.fromkeys(cutoffs, 0.0)

    roc_samples = []

    progress = len(candidates)

    # for each candidate we do training and testing...
    for c in candidates:

        logger.info('%d-----------------------'%progress)

        nonobservedlinks = {}
        nonobserved_actual_edges = []
        nonobserved_nonexist_edges = []

        # Label every pair that has the candidate as its first endpoint.
        for j in range(m):
            key = '%d,%d'%(c,j)
            nonobservedlinks[key] = adj[c,j]

            if adj[c,j] > 0:
                nonobserved_actual_edges.append(key)
            else:
                nonobserved_nonexist_edges.append(key)

        auc = 0.0
        # Average precision at k is defined per candidate, so it starts from
        # zero for each one.
        apk = dict.fromkeys(cutoffs, 0.0)

        folds = zip(
            utils.k_fold_cross_validation(list(nonobserved_actual_edges), kfold),
            utils.k_fold_cross_validation(list(nonobserved_nonexist_edges), kfold))

        for (es_p_training, es_p_validation), (es_m_training, es_m_validation) in folds:

            validation = es_p_validation + es_m_validation

            # Training graph: the full graph minus the edges held out for
            # validation.
            trainingG = g.copy()

            edges_2_delete = []
            for link in validation:
                v = link.split(',')
                v1 = int(v[0])
                v2 = int(v[1])
                # error=False makes get_eid return -1 for a missing edge.
                eId = trainingG.get_eid(v1,v2, directed=False, error=False)
                if eId != -1:
                    edges_2_delete.append(eId)

            trainingG.delete_edges(edges_2_delete)

            rwr_scores = pgrank.rwr_score(trainingG, validation)

            # Sanity check: report any score above 1.
            for rwr_score in rwr_scores.values():
                if rwr_score > 1:
                    logger.info('overflow? rwr_score: %0.2f'%(rwr_score))

            actual = [nonobservedlinks[k] for k in validation]
            posterior = [rwr_scores[k] for k in validation]
            actual_edges = [k for k in validation if nonobservedlinks[k] > 0]
            roc_samples.extend(
                (k, nonobservedlinks[k], rwr_scores[k]) for k in validation)

            auc_ = benchmarks.auc(actual, posterior)
            auc += auc_
            total_auc += auc_

            # Validation indexes by descending score. The sort is stable, so
            # each top-k prediction list is a prefix of this ranking.
            ranked = sorted(range(len(posterior)), reverse=True, key=lambda idx: posterior[idx])
            validation_array = np.array(validation)

            for topK in cutoffs:
                predicted = validation_array[ranked[:topK]]

                apk_ = benchmarks.apk(actual_edges, predicted, topK)
                apk[topK] += apk_
                mapk[topK] += apk_

        logger.info('%d: auc: %f'%(c, float(auc)/kfold))

        for topK in cutoffs:
            logger.info('%d: ap@%d: %f'%(c, topK, (apk[topK]/kfold)))

        progress -= 1

    logger.info('auc: %f'%(float(total_auc)/(kfold*len(candidates))))
    for topK in cutoffs:
        logger.info('map@%d: %f'%(topK, (mapk[topK]/(kfold*len(candidates)))))

    np.save('%s/data/%s-%s.per_user.roc.samples.npy'%(root_folder(),startBudgetYear, endBudgetYear), np.array(roc_samples))
Ejemplo n.º 20
0
def network_characteristics(budgetYears):
    """Log a set of descriptive statistics for one network.

    First the simplified graph is described (nodes, edges, density, the
    result of ``average_number_of_new_edges`` against an earlier network,
    and ``num_of_isolated_components``). Then its largest component is
    described: size, clustering coefficients with unit and collaboration
    weights plus the triplet-based one, average shortest path length with
    unit and reciprocal weights, and ``diversity`` with reciprocal weights.
    """
    logger.info("================================================================")
    logger.info(budgetYears)

    current = load_network_for(budgetYears)

    # The original notes that the simplified network has no isolated nodes.
    g = ResearchCollaborationNetwork.simplify(current.g.copy())

    logger.info('# of nodes: %d'%(len(g.vs)))
    logger.info('# of edges: %d'%(len(g.es)))
    logger.info('density: %.3f'%(g.density()))

    new_edges = 0.0

    # 2006 is the baseline, so only later spans are compared with an
    # earlier network.
    starts_after_baseline = budgetYears[0] > 2006
    if starts_after_baseline:
        if budgetYears[0]  == 2010 and budgetYears[-1] == 2012:
            earlier_years = range(2006,2010)
        else:
            # The same span, one year earlier.
            earlier_years = np.array(budgetYears) - 1

        earlier = load_network_for(earlier_years)
        earlier_g = ResearchCollaborationNetwork.simplify(earlier.g.copy())

        new_edges = average_number_of_new_edges(g, earlier_g)
    logger.info('average number of new edges: %.3f'%new_edges)

    logger.info('# of isolated components: %d'%(num_of_isolated_components(g)))

    # Continue with the largest component only: shortest path length is
    # rather arbitrary on graphs with isolated components.
    g = ResearchCollaborationNetwork.largest_component(g)
    collaboration_weights = g.es['weight']
    reciprocal_weights = [1/float(w) for w in collaboration_weights]
    uniform_weights = [1 for _ in collaboration_weights]

    logger.info('# of nodes (largest component): %d'%(len(g.vs)))
    logger.info('# of edges (largest component): %d'%(len(g.es)))

    clustering_unit = g.transitivity_avglocal_undirected(mode='zero', weights=uniform_weights)
    logger.info('C_g (weights = None): %.3f'%clustering_unit)

    clustering_weighted = g.transitivity_avglocal_undirected(mode='zero', weights=collaboration_weights)
    logger.info('C_g (weights = number of collaborations): %.3f'%clustering_weighted)

    clustering_triplets = g.transitivity_undirected(mode='zero')
    logger.info('C_g (triplets definition): %.3f'%clustering_triplets)

    path_unit = average_shortest_path_length_weighted(g, uniform_weights)
    logger.info("L_g (weights = 1): %.3f"%path_unit)

    path_reciprocal = average_shortest_path_length_weighted(g, reciprocal_weights)
    logger.info("L_g (weights = 1/weights): %.3f"%path_reciprocal)

    diversity_reciprocal = diversity(g, reciprocal_weights)
    logger.info("D_g (weights = 1/weights): %.3f"%diversity_reciprocal)