def test_write_state_file(random_paths, k, tmpdir):
    """Save every layer 1..k of a multi-order model to its own state file.

    Files are written below a fresh ``sub`` directory of ``tmpdir`` and are
    named ``multi_order_state.<layer>``. There are no assertions; the test
    passes when none of the calls raises.
    """
    base_name = str(tmpdir.mkdir("sub").join("multi_order_state"))
    sample = random_paths(20, 40, 6)
    model = pp.MultiOrderModel(sample, max_order=k)

    for layer in range(1, k + 1):
        target = base_name + '.' + str(layer)
        model.save_state_file(target, layer=layer)
def test_single_path_likelihood(random_paths):
    """Check likelihoods of a multi-order model fitted on two merged path sets.

    Covers: ordering and additivity of the likelihoods of the two parts,
    the sign of a single-path likelihood, non-decreasing likelihood over
    layers 0..2, and the identity of the most likely entry of
    ``p12.paths[3]`` under layer 2.
    """
    p1 = random_paths(size=10, rnd_seed=20, num_nodes=10)  # type: pp.Paths
    p2 = random_paths(size=100, rnd_seed=0, num_nodes=50)
    p12 = p1 + p2
    mom = pp.MultiOrderModel(p12, max_order=3)

    lkh1 = mom.likelihood(p1)
    lkh2 = mom.likelihood(p2)
    lkh12 = mom.likelihood(p12)

    # the first path set is expected to score higher than the second one
    assert lkh1 > lkh2
    # the likelihoods of the two parts must add up to that of the union
    assert (lkh1 + lkh2) == pytest.approx(lkh12)

    assert mom.path_likelihood(('1', '2'), layer=0, freq=4) < 0

    # the likelihood of a fixed path must not decrease from layer to layer
    probe = ('6', '7', '2', '0', '6')
    previous = None
    for layer in range(3):
        current = mom.path_likelihood(probe, layer=layer, freq=9)
        assert previous is None or current >= previous
        previous = current

    # rank every entry of p12.paths[3] by its (non-log) layer-2 likelihood
    path_likelihoods = [
        (mom.path_likelihood(path, layer=2, freq=counts.sum(), log=False), path)
        for path, counts in p12.paths[3].items()
    ]
    best = max(path_likelihoods)
    assert best[1] == ('23', '32', '19', '8')
def test_save_statefile(random_paths, tmpdir):
    """Write layer 2 to a state file and verify no line contains ``{``."""
    target = str(tmpdir.join("statefile.sf"))
    sample = random_paths(3, 20, 6)
    model = pp.MultiOrderModel(sample, max_order=2)
    model.save_state_file(target, layer=2)

    with open(target) as handle:
        written = handle.readlines()

    # a brace would mean that a dictionary was written out verbatim
    for text in written:
        assert '{' not in text
def test_estimate_order_2():
    """Example with second-order correlations: estimated order must be 2."""
    paths = pp.Paths()

    # four single-step paths through the hub node 'c'
    for step in ('a,c', 'b,c', 'c,d', 'c,e'):
        paths.add_path(step)

    # only a->c->d and b->c->e are observed, four times each
    for _ in range(4):
        for two_step in ('a,c,d', 'b,c,e'):
            paths.add_path(two_step)

    model = pp.MultiOrderModel(paths, max_order=2)
    estimated = model.estimate_order()
    assert estimated == 2
def test_estimate_order_1():
    """Example without second-order correlations: estimated order must be 1."""
    paths = pp.Paths()

    # four single-step paths through the hub node 'c'
    for step in ('a,c', 'b,c', 'c,d', 'c,e'):
        paths.add_path(step)

    # all four combinations of predecessor and successor of 'c' are
    # observed, four times each
    for _ in range(4):
        for two_step in ('a,c,d', 'b,c,e', 'b,c,d', 'a,c,e'):
            paths.add_path(two_step)

    model = pp.MultiOrderModel(paths, max_order=2)
    estimated = model.estimate_order()
    assert estimated == 1, \
        "Error, wrongly detected higher-order correlations"
def test_estimate_order_2():
    """Order estimation and higher-order network sizes (camelCase API).

    Builds a path set with second-order correlations, then checks the
    estimated order of the multi-order model, the estimated order of a
    random symbol sequence under BIC and AIC, and the node/link counts of
    the first- and second-order networks before and after ``reduceToGCC``.
    """
    # NOTE(review): another function named test_estimate_order_2 appears
    # earlier in this source; if both live in one module only the later
    # definition is kept - confirm.
    paths = pp.Paths()

    for step in ('a,c', 'b,c', 'c,d', 'c,e'):
        paths.addPath(step)

    for _ in range(4):
        for two_step in ('a,c,d', 'b,c,e'):
            paths.addPath(two_step)

    model = pp.MultiOrderModel(paths, maxOrder=2)
    assert model.estimateOrder(
        paths) == 2, "Error, did not detect second-order correlations"

    # 100000 symbols drawn from range(10), converted to strings
    symbols = [str(value) for value in _np.random.choice(range(10), 100000)]
    sequence = pp.MarkovSequence(symbols)
    for criterion in ('BIC', 'AIC'):
        assert sequence.estimateOrder(maxOrder=2, method=criterion) == 1, \
            "Error, wrongly detected higher-order correlations"

    first_order = pp.HigherOrderNetwork(paths, k=1)
    assert first_order.vcount() == 5, \
        "Error, wrong number of nodes in first-order network"
    assert first_order.ecount() == 4, \
        "Error, wrong number of links in first-order network"

    second_order = pp.HigherOrderNetwork(paths, k=2)
    assert second_order.vcount() == 4, \
        "Error, wrong number of nodes in second-order network"
    assert second_order.ecount() == 2, \
        "Error, wrong number of links in second-order network"

    second_order.reduceToGCC()
    assert second_order.vcount() == 1, \
        "Error, wrong number of nodes in giant connected component"
    assert second_order.ecount() == 0, \
        "Error, wrong number of links in giant connected component"
#%% In [1]
import pathpy as pp


def _cluster_colors(nodes):
    """Map each node name to a plot colour.

    Names shorter than two characters become 'red', names starting with
    '1' become 'green' and every other name becomes 'blue'.
    """
    colors = {}
    for v in nodes:
        if len(v) < 2:
            colors[v] = 'red'
        elif v.startswith('1'):
            colors[v] = 'green'
        else:
            colors[v] = 'blue'
    return colors


t = pp.TemporalNetwork.read_file('data/temporal_clusters.tedges')
paths = pp.path_extraction.paths_from_temporal_network_dag(t)
mog = pp.MultiOrderModel(paths, 3)

# Color nodes according to known ground-truth clusters
clusters = _cluster_colors(paths.nodes)
pp.visualisation.plot(
    mog.layers[mog.estimate_order()],
    plot_higher_order_nodes=False,
    node_color=clusters)

#%% In [2]
from random import shuffle

# split the time-stamped edges into their endpoints and their time stamps
edges = [(src, dst) for (src, dst, _) in t.tedges]
times = [stamp for (_, _, stamp) in t.tedges]
shuffle(times)

# rebuild the network with the shuffled time stamps assigned to the edges
t_shuffled = pp.TemporalNetwork()
for (src, dst), stamp in zip(edges, times):
    t_shuffled.add_edge(src, dst, stamp)

paths = pp.path_extraction.paths_from_temporal_network_dag(t_shuffled)
mog = pp.MultiOrderModel(paths, 3)

clusters = _cluster_colors(paths.nodes)
pp.visualisation.plot(
    mog.layers[mog.estimate_order()],
    plot_higher_order_nodes=False,
    node_color=clusters)
# Notebook excerpt: compares higher-order network models of order 1, 2 and 5
# and runs a likelihood-ratio test between multi-order models.
# NOTE(review): `p`, `path`, `toy_paths` and `chi2` are not defined in this
# excerpt; presumably they come from earlier cells (chi2 presumably being
# scipy.stats.chi2) - confirm.

# First-order model plus second- and fifth-order models built with
# null_model=True.
hon_1 = pp.HigherOrderNetwork(p, k=1)
hon_2 = pp.HigherOrderNetwork(p, k=2, null_model=True)
hon_5 = pp.HigherOrderNetwork(p, k=5, null_model=True)

# Non-log likelihood of the same paths `p` under each of the three models.
print(hon_1.likelihood(p, log=False))
print(hon_2.likelihood(p, log=False))
print(hon_5.likelihood(p, log=False))

#%% In [9]
print('Path consists of {0} nodes'.format(len(path)))
# The first element of each higher-order node sequence is dropped before
# counting, so the printed number is the sequence length minus one.
print('Number of transitions in first-order model = ',
      str(len(hon_1.path_to_higher_order_nodes(path)[1:])))
print('Number of transitions in second-order model = ',
      str(len(hon_2.path_to_higher_order_nodes(path)[1:])))
print('Number of transitions in fifth-order model = ',
      str(len(hon_5.path_to_higher_order_nodes(path)[1:])))

#%% In [10]
# Build a multi-order model up to order 2 and plot its layers 0, 1 and 2.
mog = pp.MultiOrderModel(toy_paths, max_order=2)
print(mog)
pp.visualisation.plot(mog.layers[0])
pp.visualisation.plot(mog.layers[1])
pp.visualisation.plot(mog.layers[2])

#%% In [11]
mog = pp.MultiOrderModel(toy_paths, max_order=2)

# d: difference in degrees of freedom between max_order=2 and max_order=1.
d = mog.degrees_of_freedom(max_order=2) - mog.degrees_of_freedom(max_order=1)

# x: -2 times the difference of the log-likelihoods (order 1 minus order 2).
x = - 2 * (mog.likelihood(toy_paths, log=True, max_order=1)
           - mog.likelihood(toy_paths, log=True, max_order=2))

# NOTE: this rebinds `p`, which the cells above use as the paths object;
# re-running those cells after this one would pass a float instead.
p = 1 - chi2.cdf(x, d)
print('p value of null hypothesis that data has maximum order 1 = {0}'.format(p))
def test_summary_multi_order_model(random_paths):
    """Build a multi-order model up to order 3 and print its summary.

    There are no assertions; the test passes when construction and
    printing do not raise.
    """
    # NOTE(review): this call uses the camelCase keyword `maxOrder`, while
    # other tests in this source pass `max_order` - confirm which spelling
    # the targeted pathpy version accepts.
    sample = random_paths(90, 90)
    model = pp.MultiOrderModel(paths=sample, maxOrder=3)
    print(model)
def _collect_click_paths(rows, user):
    """Group consecutive clicks of one game into article paths.

    Args:
        rows: Iterable of data rows (header already consumed). Column 2 is
            read as the user, column 3 as the article and column 4 as the
            game.
        user: User whose rows are kept, or the string "all" to keep every
            row.

    Returns:
        A list of paths, each a list of article names. A new path starts
        whenever the game differs from the game of the previously kept row;
        an article equal to the previously kept article is not repeated.
    """
    paths = []
    current = []
    last_article = None
    last_game = None

    for row in rows:
        # Blank or truncated lines carry no click information.
        if len(row) < 5:
            continue
        # Ignoring data from other users
        if user != "all" and row[2] != user:
            continue

        article, game = row[3], row[4]
        if game == last_game:
            if article != last_article:
                current.append(article)
        else:
            if current:
                paths.append(current)
            current = [article]
        last_article, last_game = article, game

    # Flush the path that was still being built when the input ended;
    # without this the last game of the file would be lost.
    if current:
        paths.append(current)
    return paths


def estimate_user_kopt(user, top_nodes):
    """Estimate the optimal Markov order of a user's click paths.

    Reads the tab-separated file ``PATH + FILENAME``, collects the article
    paths of ``user`` (or of every user when ``user == "all"``), restricts
    them to the ``top_nodes`` most frequently visited articles via
    ``listrun`` and fits a ``pp.MultiOrderModel`` with ``max_order=2``.

    Args:
        user: User identifier matched against column 2, or "all".
        top_nodes: Number of most frequent articles to keep.

    Returns:
        Tuple ``(number_of_reduced_paths, estimated_order)``.
    """
    ##PATH COLLECTION
    filename = PATH + FILENAME
    # newline='' is what the csv module expects from its input file.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter='\t')
        print(f"Parsed file: {FILENAME}")

        # The first row is the header; it is reported but not parsed.
        header = next(csv_reader, None)
        if header is not None:
            print(f'Columns: {", ".join(header)}')

        paths = _collect_click_paths(csv_reader, user)

    ##PATH FILTERING
    node_counts = Counter(item for path in paths for item in path)
    # most_common() sorts by descending count, keeping first-seen order
    # among equal counts.
    top_sorted_nodes = [node for node, _ in node_counts.most_common()][0:top_nodes]

    # NOTE(review): listrun is defined elsewhere; presumably it splits a
    # path into runs that contain only the given top nodes - confirm.
    paths_reduced = []
    for path in paths:
        paths_reduced.extend(listrun(path, top_sorted_nodes))

    ## Add paths to pathpy
    p = pp.Paths()
    for path in paths_reduced:
        p.add_path(path)
    print(p)

    mog = pp.MultiOrderModel(p, max_order=2)
    return (len(paths_reduced), mog.estimate_order())
def test_print(random_paths):
    """Build a multi-order model up to order 3 and print it.

    There are no assertions; the test passes when printing does not raise.
    """
    sample = random_paths(90, 0, 20)
    model = pp.MultiOrderModel(sample, max_order=3)
    print(model)
def test_test_network_hypothesis_values(random_paths, k, method, e_ic0, e_ic1):
    """Compare the two values returned by the network hypothesis test.

    The second and third elements of the result of
    ``test_network_hypothesis`` must approximately equal ``e_ic0`` and
    ``e_ic1``; the first element is not checked.
    """
    sample = random_paths(20, 40, 6)
    model = pp.MultiOrderModel(sample, max_order=k)

    outcome = model.test_network_hypothesis(sample, method=method)
    (_, ic0, ic1) = outcome

    assert e_ic0 == pytest.approx(ic0)
    assert e_ic1 == pytest.approx(ic1)
def test_test_network_hypothesis(random_paths, k, method):
    """Run the network hypothesis test and unpack its three-part result.

    There are no assertions; the test passes when the call succeeds and
    the result unpacks into exactly three values.
    """
    sample = random_paths(20, 40, 6)
    model = pp.MultiOrderModel(sample, max_order=k)

    outcome = model.test_network_hypothesis(sample, method=method)
    (is_net, ic0, ic1) = outcome
def test_init(random_paths, k):
    """A model built with ``max_order=k`` must expose ``k + 1`` layers."""
    sample = random_paths(90, 0, 20)
    model = pp.MultiOrderModel(sample, max_order=k)

    expected_layers = k + 1
    assert len(model.layers) == expected_layers
# For every ground-truth order k in 2..4: generate a random network, sample
# path sets of growing size from a k-th-order path model and record which
# order the multi-order model detects for each of the 20 batches.
# NOTE(review): `n`, `deg` and `detected_order` are defined outside this
# excerpt; detected_order is indexed with order - 2, so it presumably holds
# one list per tested order - confirm.
for order in range(2, 5):
    # generate random (strongly connected) network
    g = igraph.Graph.Erdos_Renyi(
        n=n, m=int(n * deg), directed=True, loops=True
    ).clusters(mode='STRONG').giant()

    # generate k-th-order path model
    pathModel = KOrderPathModel.KOrderPathModel(g, k=int(order))

    batch = 1
    while batch <= 20:
        try:
            # the number of sampled paths grows with the batch number
            pathset = pathModel.generatePaths(
                pathCount=ceil(10**(0.25 * batch)), pathLength=20)
            model = pp.MultiOrderModel(pathset, max_order=order + 1)

            # estimate optimal order
            optimal_order = model.estimate_order()
            print('k = ' + str(order) + ', batch = ' + str(batch) + ':')
            print('the optimal Markovian order of the data is ' + str(optimal_order))
            detected_order[order - 2].append(optimal_order)
        except Exception:
            # Catch Exception rather than everything: a bare except would
            # also swallow KeyboardInterrupt and SystemExit, which makes
            # this retry loop impossible to interrupt.
            print('k = ' + str(order) + ', batch = ' + str(batch) + ' PathsTooShort')
            print('retrying...')
        else:
            # only advance once a batch has succeeded; failed batches are
            # regenerated with the same size
            batch += 1

print(detected_order)
# Clean up the raw path strings, build the path set and estimate its order.
# NOTE(review): `original_path_set` and `break_and_add_path` are defined
# outside this excerpt.

# drop the last entry of the raw list
original_path_set.pop()

# Split every raw path on '+' and strip the empty fragments: one '' is
# removed from every entry and a second one from every entry but the first.
for i in range(len(original_path_set)):
    temp = original_path_set[i].split('+')
    temp.remove('')
    if i != 0:
        temp.remove('')
    original_path_set[i] = temp

#extract paths without redundant nodes
real_path_set = pathpy.Paths()
print('generating pathset without redundant nodes...')
break_and_add_path(original_path_set, real_path_set)
print('information of the pathset without redundant nodes:')
print(real_path_set)

#The high-order model is generated from the path set S and named as Model.
#The maximum order is preliminarily set as 5. If the final estimated optimal
#order is the same as the maximum order, the maximum order should be increased
max_order = 5
success = False
while not success:
    try:
        model = pathpy.MultiOrderModel(real_path_set, max_order=max_order)
        success = True
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still works, and
        # stop lowering the order once it reaches zero: re-raise the real
        # error instead of retrying with negative orders.
        if max_order <= 0:
            raise
        max_order -= 1

#estimate optimal order of the pathset
print('detecting the optimal order...')
optimal_order = model.estimate_order()
print('the optimal Markovian order of the data is ' + str(optimal_order))