def run_pagerank(self, damping_factor=0.85, iterations=10):
    return pagerank(self.index_file, self.edge_file, self.max_node, self.edge_count,
                    damping_factor=damping_factor, iterations=iterations)
def test_pagerank(self):
    size = 1000
    g = nx.DiGraph(nx.powerlaw_cluster_graph(size, 3, 0.001))
    N = len(g.nodes())
    # Open in text mode so writing formatted strings works.
    tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
    for node in g.nodes():
        outlinks = g.out_edges(nbunch=[node])
        outlinks = [str(n2) for n1, n2 in outlinks]
        if not outlinks:
            value = 'pr_results,%s,%s' % (1.0 / N, N)
            tmp_file.write('%s\t%s\n' % (node, value))
        else:
            outlinks_str = ','.join(outlinks)
            value = 'pr_results,%s,%s,' % (1.0 / N, N)
            value += outlinks_str
            tmp_file.write('%s\t%s\n' % (node, value))
    tmp_file.flush()
    input_path = tmp_file.name
    job_id = 'unittest'
    sorted_ids = pagerank(job_id, self.iter_count, input_path, self.top_n)
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
    if self.top_n <= size:
        self.assertEqual(len(sorted_ids), self.top_n, 'some ids are missing')
    id_ranges = range(0, size)
    for _id in sorted_ids:
        self.assertIn(int(_id), id_ranges, 'node should be in the graph')
def netrank_gridsearch(network_path, diff_expr, out_path, alpha_prec=11):
    """Perform a grid search over the alpha parameter.

    This function computes the netranks for a given network and differential
    expression data over a range of alpha parameters.

    Parameters
    ----------
    network_path:
        Path to the network in an hdf5 container (with gene names). Gene names
        and expression data are assumed to be in the same order as the nodes
        in the adjacency matrix.
    diff_expr:
        Differential expression dataframe. If set to None, PageRank is
        calculated; otherwise NetRank is used.
    out_path:
        Directory to which the results are written.
    alpha_prec:
        The number of runs used to compute netrank scores. Default is 11,
        which corresponds to computing ranks for
        alpha = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1].
    """
    alpha_range = np.linspace(0, 1, alpha_prec)
    print("Running grid search for alpha={}".format(alpha_range))
    for alpha in alpha_range:
        scores, gene_names = pagerank.pagerank(network_path, diff_expr, alpha)
        out = os.path.join(out_path, 'netrank_alpha_{}.txt'.format(alpha))
        pagerank.write_ranking(scores, gene_names, out)
        print("Netrank for alpha {} computed successfully!".format(alpha))
    print("Grid search completed successfully (results in {})".format(out_path))
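# Hedged usage sketch for the grid search above. The file names and the way the
# differential expression table is loaded are illustrative assumptions, not
# taken from the original code.
# import pandas as pd
# diff_expr = pd.read_csv('diff_expr.csv', index_col=0)   # hypothetical input file
# netrank_gridsearch('network.h5', diff_expr, 'results/', alpha_prec=11)
# netrank_gridsearch('network.h5', None, 'results/')       # plain PageRank run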
def test_simple(self):
    graph_dict = {
        "A": {},
        "B": {"A": {}, "C": {}},
        "C": {"A": {}},
        "D": {"A": {}, "B": {}, "C": {}},
    }
    graph = Graph(graph_dict)
    ranks = pagerank(graph, iterations=10)
    # Assert the comparison rather than discarding its result; use a tolerance
    # matching the precision of the expected constant.
    self.assertTrue(math.isclose(ranks["A"], 0.4583333, rel_tol=1e-6))
def main():
    sentences = ["i am paradox", "my name is paradox", "i love caffeine"]
    sentences_tokenized = [sentence.split() for sentence in sentences]
    print(sentences_tokenized)
    matrix = build_sim_matrix(sentences_tokenized)
    print(matrix)
    result = pagerank(matrix)
    print(result)
def undirected_page_rank(q, D, p, sim, th, priors):
    results = {}
    doc_ids = q['visited_documents']
    sim_graph = build_graph(cl.get_subset(D, doc_ids), sim, th)
    if priors == 'baseline':
        # Baseline priors based on the document ranks from the 1st delivery.
        priors_vec = {doc_id: 1 / (50 + rank)
                      for doc_id, rank in q['visited_documents_orders'].items()}
    elif priors == 'classification':
        # Classification priors based on the probabilities from the 2nd delivery.
        priors_vec = q['document_probabilities']
    else:
        # Fall back to uniform personalization instead of failing on an unknown option.
        priors_vec = None
    pr_values = {
        'vanilla_pk': pk.pagerank(sim_graph, max_iter=50, weight=None),
        'extended_pk': pk.pagerank(sim_graph, max_iter=50, weight='weight',
                                   personalization=priors_vec),
    }
    for pr_type in pr_values:
        top_ids = sorted(pr_values[pr_type],
                         key=lambda x: pr_values[pr_type][x], reverse=True)[:p]
        results[pr_type] = cl.get_subset(pr_values[pr_type], top_ids)
    return results
def main():
    parser = argparse.ArgumentParser()
    # The damping factor is a float, so the argument must be parsed as float.
    parser.add_argument('damping', nargs='?', help='The damping factor to use.',
                        type=float, default=0.85)
    args = parser.parse_args()
    graph = endograph.endograph(sys.stdin.read())
    scores = zip(sorted(graph.nations), pagerank.pagerank(graph.matrix(), args.damping))
    print("\n".join("%.10f %s" % (b, a)
                    for (a, b) in sorted(scores, key=lambda z: z[1], reverse=True)))
def test_with_iter_and_dampening(self):
    graph_dict = {
        "A": {},
        "B": {"A": {}, "C": {}},
        "C": {"A": {}},
        "D": {"A": {}, "B": {}, "C": {}},
    }
    graph = Graph(graph_dict)
    ranks = pagerank(graph, iterations=0, d=1)
    # Assert the comparisons rather than discarding their results.
    self.assertTrue(math.isclose(ranks["A"], 0.50747, rel_tol=1e-4))
    self.assertTrue(math.isclose(ranks["B"], 0.27431, rel_tol=1e-4))
    self.assertTrue(math.isclose(ranks["C"], 0.1925, rel_tol=1e-4))
    self.assertTrue(math.isclose(ranks["D"], 0.15, rel_tol=1e-4))
def pagerank_aggregator(objects, threshold, alpha):
    """Implements the PageRank aggregation for a given alpha and threshold.

    Alpha is the bias towards the surf probability (non-random in this case).
    The threshold (epsilon) controls convergence and is a small number in
    practice. The function first constructs the graph from the rankers, then
    calls the pagerank function repeatedly until the average change in
    PageRank scores is below the threshold, and finally converts the
    resulting scores into a ranking.
    """
    # Construct the graph version of the rankers and update indegrees.
    indegrees = {}
    total_indegrees = 0.0
    graph = {}
    for key in objects.keys():
        graph[key] = []
        indegrees[key] = 0.0
    for key1 in objects.keys():
        for key2 in objects.keys():
            if key1 != key2:
                count = num_higher(objects, key1, key2)
                if count > 0:
                    graph[key1].append((key2, float(count)))
                    indegrees[key2] += count
                    total_indegrees += count
    # Normalize so that the outlinks of each node sum to 1.
    for key1 in graph.keys():
        total = 0
        for (key2, val) in graph[key1]:
            total += val
        for i in range(len(graph[key1])):
            (key2, val) = graph[key1][i]
            graph[key1][i] = (key2, val / total)
    # Normalize indegrees as well.
    for key in indegrees.keys():
        indegrees[key] /= total_indegrees
    # Call PageRank.
    final_scores = pg.pagerank(graph, indegrees, threshold, alpha)
    # final_scores = pg.pagerank(graph, {}, threshold, alpha)
    # Convert the final scores to a ranking.
    ranker = get_ranker_for_scores(final_scores)
    rankscore = kendall_tau(objects, ranker)
    return ranker, rankscore
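# Minimal, self-contained sketch of the damped power iteration described in the
# docstring above (illustrative only; the real pg.pagerank may differ, and this
# version ignores dangling-node mass). Here `graph` maps node -> [(neighbor, weight)]
# with each node's weights summing to 1, and `priors` is a normalized teleport
# distribution, matching the structures built by pagerank_aggregator.
def _pagerank_power_iteration_sketch(graph, priors, threshold=1e-6, alpha=0.85):
    scores = {node: 1.0 / len(graph) for node in graph}
    while True:
        # Teleport component weighted by the priors, then propagate rank mass
        # along outgoing edges, scaled by alpha.
        new_scores = {node: (1 - alpha) * priors.get(node, 0.0) for node in graph}
        for node, outlinks in graph.items():
            for neighbor, weight in outlinks:
                new_scores[neighbor] += alpha * scores[node] * weight
        avg_change = sum(abs(new_scores[n] - scores[n]) for n in graph) / len(graph)
        scores = new_scores
        if avg_change < threshold:
            return scores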
def pagerank_simulation_test(input_path):
    iter_count = 5
    top_n = 500
    job_id = 1
    sorted_ids = pagerank(job_id, iter_count, input_path, top_n)
    if sorted_ids:
        # Print at most the first 10 ids.
        for i in range(min(10, len(sorted_ids))):
            print(sorted_ids[i])
    fs = HadoopFS()
    fs.rmr('%s/hat_results' % job_id)
def find_key_phrases(tokens, parts_of_speech, window):
    tagged_tokens = nltk.pos_tag(tokens)
    nodes = [(token, 1) for token in tokens]
    edges = []
    for i in range(len(tagged_tokens)):
        if tagged_tokens[i][1] in parts_of_speech:
            # Connect the token to every token within the co-occurrence window.
            right = min(i + window, len(tagged_tokens))
            for j in range(i + 1, right):
                edges.append((tokens[i], tokens[j], {"weight": 1}))
    return pagerank(nodes, edges, 15)
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []
    tmp_file = prepare_data_for_pr(topic_id, date, window_size)
    if not tmp_file:
        return data
    input_tmp_path = tmp_file.name
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX
    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)
    print(sorted_uids)
def pagerankCenter(graph):
    if len(graph.nodes) == 1:
        # Single-node graph: that node is the center with radius 0.
        return list(graph.nodes)[0], 0
    simMatrix = pagerank(graph, graph.nodes)
    sumDist = dict()
    minDist = float("inf")
    centerList = list()
    for node in graph.nodes:
        sumDist[node] = sum(simMatrix[node].values())
        if minDist > sumDist[node]:
            centerList = [node]
            minDist = sumDist[node]
        elif minDist == sumDist[node]:
            centerList.append(node)
    center = centerList[0]
    radius = nx.eccentricity(graph, v=center)
    return center, radius
def main():
    parser = argparse.ArgumentParser(description='Running PageRank')
    parser.add_argument('path_file', metavar='f', type=str,
                        help='path to file to perform page ranking')
    parser.add_argument('beta', metavar='b', type=float,
                        help='allow teleportation with 1-beta probability')
    parser.add_argument('eps', metavar='eps', type=float,
                        help='epsilon value for convergence')
    parser.add_argument('n_nodes', metavar='nodes', type=int,
                        help='number of unique nodes')
    parser.add_argument('n_edges', metavar='edges', type=int,
                        help='number of all edges')
    parser.add_argument('pow_iter', metavar='pow_iter', type=int,
                        help='number of power iterations')
    args = parser.parse_args()
    file_path = args.path_file
    beta = args.beta
    eps = args.eps
    n_nodes = args.n_nodes
    n_edges = args.n_edges
    num_iterations = args.pow_iter
    result = pagerank(file_name=file_path, beta=beta,
                      power_iterations=num_iterations, num_edges=n_edges,
                      num_nodes=n_nodes, eps=eps)
def pagerank_rank(top_n, date, topic_id, window_size):
    data = []
    tmp_file = prepare_data_for_pr(topic_id, date, window_size)
    if not tmp_file:
        return data
    input_tmp_path = tmp_file.name
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    iter_count = PAGERANK_ITER_MAX
    sorted_uids = pagerank(job_id, iter_count, input_tmp_path, top_n)
    topicname = acquire_topic_name(topic_id)
    if not topicname:
        return data
    data = save_rank_results(sorted_uids, 'topic', 'pagerank', date, window_size, topicname)
    return data
for row in reader:
    all_realted_paper_li.append(row[0])
print "all_realted_paper_li create over"
print "len(all_realted_paper_li):", len(all_realted_paper_li)

# Load the author's papers from the previous five years.
paper5_set = set()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_realted_paper_li:
            paper5_set.add(row[0])
print "paper5_set and paper5_li create over"
print "len(paper5_set):", len(paper5_set)

with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in paper5_set:
            graph.add_node(row[0])
            cite_paper_li = row[7][2:-2].split('\', \'')
            for cite_paper in cite_paper_li:
                if cite_paper in all_realted_paper_li:
                    graph.add_edge(row[0], cite_paper)
print "graph create over"
print graph.number_of_nodes()
print graph.number_of_edges()

rank_value = pagerank.pagerank(graph, max_iteration=1000)
pagerank.save_pagerank_value(rank_value, r'..\result\paper_unweighted_pagerank.csv')
def crawler():
    try:
        conn = sqlite3.connect(db_file)
        conn.text_factory = str
        pool = threadpool.ThreadPool(16)
        # Init keywords
        with open(word_file, "rb") as f:
            keywords = pickle.load(f)
        with open(searched_word_file, "rb") as f:
            searched = pickle.load(f)
        while True:
            keywords_list = list(keywords)
            while len(keywords_list) != 0:
                init_vals = []
                for k in keywords_list[:key_batch_size]:
                    for site in search_sites:
                        for search_page in site['search_page']:
                            init_vals.append(([site['domain'], search_page,
                                               site['header'], k], None))
                threqs = threadpool.makeRequests(proc_search, init_vals)
                for req in threqs:
                    pool.putRequest(req)
                pool.wait()
                print("Committing...")
                cursor = conn.cursor()
                for cmd in webpage_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass
                for cmd in word_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass
                for cmd in ref_cmds:
                    try:
                        cursor.execute(cmd[0], cmd[1])
                    except:
                        pass
                cursor.close()
                conn.commit()
                webpage_cmds.clear()
                ref_cmds.clear()
                word_cmds.clear()
                print("Ranking and indexing...")
                pagerank()
                index()
                print("Saving keywords...")
                for k in keywords_list[:key_batch_size]:
                    searched.add(k)
                keywords_list = keywords_list[key_batch_size:]
                with open(word_file, "wb") as f:
                    pickle.dump(set(keywords_list), f)
                with open(searched_word_file, "wb") as f:
                    pickle.dump(searched, f)
                print("Done.")
            keywords = new_key.copy()
            new_key.clear()
            print("Iterator Done.")
    except Exception as e:
        print(e)
    finally:
        conn.close()
'''
Run the PageRank algorithm on the moon landing webgraph.
'''
from graph import adjacency_list
from pagerank import pagerank
from results import output_rank_json, output_rank_csv, output_random_walks
from randomwalks import random_walks

# Parse the dataset into an adjacency list
adj_list = adjacency_list("data/adj_list")

# Non-scaled PageRank
rank = pagerank(adj_list, 10)
# Output results of basic PageRank
output_rank_json("data/nodes", rank, "out/unscaledranking.json")
output_rank_csv("data/nodes", rank, "out/unscaledranking.csv")

# Scaled PageRank, s = 0.85 (default)
scaled_rank = pagerank(adj_list, 10, scaled=True)
# Output results of scaled PageRank
output_rank_json("data/nodes", scaled_rank, "out/scaledranking.json")
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking.csv")

# Scaled PageRank, s = 0.7
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.7)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking7.csv")

# Scaled PageRank, s = 0.5
scaled_rank = pagerank(adj_list, 10, scaled=True, s=0.5)
output_rank_csv("data/nodes", scaled_rank, "out/scaledranking5.csv")
def epidemic_propagation(
        adjencyMatrix,
        scenario="noVaccination",
        initContamination=0.05,
        initVaccination=0.12,
        contaminationRate=0.2,
        cureRate=0.26,
        alpha=0.85,
        epsilon=0.00001):
    """Computes the propagation of an epidemic.

    Args:
        adjencyMatrix: The adjacency matrix of the graph
        scenario: The scenario to execute (noVaccination, randomVaccination, PRVaccination)
        initContamination: Fraction of initially infected people
        initVaccination: Fraction of initially vaccinated people
        contaminationRate: Probability of contaminating a neighbour
        cureRate: Self-cure rate
        alpha: Damping factor
        epsilon: Convergence threshold for PageRank

    Returns:
        A pair of tuples: the iteration indices and the number of infected
        individuals at each iteration.
    """
    size = adjencyMatrix.shape[0]
    # Vaccinate people
    if scenario == "PRVaccination":
        # PageRank vaccination (the most important nodes are vaccinated)
        vaccinated_nodes = pagerank(adjencyMatrix, alpha, epsilon)[: int(size * initVaccination)]
    elif scenario == "randomVaccination":
        # Random vaccination
        vaccinated_nodes = np.random.choice(size, int(size * initVaccination), replace=False)
    else:
        # No vaccination
        vaccinated_nodes = []
    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 0)
    adjencyMatrix = np.delete(adjencyMatrix, vaccinated_nodes, 1)
    # Randomly select the nodes that are initially infected
    size = adjencyMatrix.shape[0]
    contam_vector = np.random.choice(size, int(size * initContamination))
    # The first entry records the infection rate parameter that was given
    res = [contaminationRate]
    # Iterations
    for i in range(200):
        # Get the neighbours of infected nodes
        neighbours = adjencyMatrix[contam_vector, :].nonzero()[1]
        # Infect neighbours (with probability contaminationRate)
        contam_vector = np.unique(np.concatenate((
            contam_vector,
            np.extract(np.random.rand(neighbours.shape[0]) < contaminationRate, neighbours),
        )))
        # Infect non-neighbours (with probability 1 - alpha)
        non_neighbours = np.array(list(set(np.arange(0, size)) - set(contam_vector) - set(neighbours)))
        if len(non_neighbours) > 0:
            contam_vector = np.unique(np.concatenate((
                contam_vector,
                np.extract(
                    np.random.rand(non_neighbours.shape[0]) < (1 - alpha) / non_neighbours.shape[0],
                    non_neighbours),
            )))
        # Cure
        contam_vector = np.delete(
            contam_vector,
            np.where(np.random.rand(contam_vector.shape[0]) < cureRate)[0])
        # Record the number of infected individuals
        res.append(float(len(contam_vector)))
    print("Results : ", list(enumerate(res)))
    return zip(*list(enumerate(res)))
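# Hedged usage sketch for epidemic_propagation. The random adjacency matrix and
# the parameter choices are illustrative assumptions, not values from the
# original code.
# import numpy as np
# adj = (np.random.rand(300, 300) < 0.02).astype(float)   # hypothetical random graph
# np.fill_diagonal(adj, 0)
# steps, infected = epidemic_propagation(adj, scenario="randomVaccination")
# print(list(zip(steps, infected))[:10])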
dataFile = open(self.data_file)
for line in dataFile:
    A = line.strip().split('\t')[0]
    B = line.strip().split('\t')[1]
    self.List.append(A)
    self.List.append(B)
dataFile.close()


if __name__ == '__main__':
    pr_data = PRdata("data/WikiData.txt")
    dg = Graph.Graph()
    dg.add_nodes(pr_data.List)
    with open("data/WikiData.txt", "r") as f:
        for line in f:
            A = line.strip().split('\t')[0]
            B = line.strip().split('\t')[1]
            dg.add_edge((A, B))
    # print(dg.node_n)
    pr = pagerank.pagerank(dg)
    page_ranks = pr.page_rank()
    print("Saving the final page rank...")
    with open("results/result.txt", "w") as f_w:
        for key, value in page_ranks.items():
            # print(key + " : ", value[0])
            f_w.write(key + '\t\t' + str(value[0]) + '\n')
    print("The final page rank has been saved.")
    input('Press any key to exit...')
def textrank(sentences_tokenized, topn=5):
    matrix = build_sim_matrix(sentences_tokenized)
    ranks = pagerank(matrix)
    return matrix, ranks
parser.add_argument('ifile', metavar='input-file', nargs=1, help='input file')
args = parser.parse_args()
print('executing with damping factor {}, epsilon {}'.format(args.d, args.e))

# Read the file
print('constructing graph...')
g = load_graph(args.ifile[0])
print('{} nodes loaded.'.format(g.nNode))

# Run PageRank
print('start pageranking...')
startTime = time.perf_counter()
pg = pagerank.pagerank(g, args.d, args.e)
endTime = time.perf_counter()
print('Time spent: {}s'.format(endTime - startTime))

# Store the result
print('store result...')
save_pagerank(pg, args.o)
import graph_generate as gg
import matrix_generate as mg
import pagerank as pr

if __name__ == '__main__':
    dg = gg.create_network()
    adj_matrix = mg.create_matrix(dg)
    pr_vec = mg.create_pr_vector(dg)
    pr.pagerank(adj_matrix, pr_vec)
lambda x: (x['page_id'], x['title'], x['url'], x['content']))
(documents_rdd.distinct()
    .map(lambda (p, t, u, c): {"page_id": p, "title": t, "url": u, "content": c})
    .map(lambda x: json.dumps(x))
    .saveAsTextFile(DOCUMENTS_SAVE_PATH))

print 'start get pagerank'
pageID_rdd = barrel_rdd.map(lambda x: x['page_id'])
title_to_pageID = (barrel_rdd
    .map(lambda x: (x['title'], x['page_id']))
    .distinct()
    .collectAsMap())
links_rdd = (load_local.pagelinksToDataframe(sc).rdd
    .map(lambda x: (x.pl_from, title_to_pageID.get(x.pl_title, -1))))
ranks_rdd = pagerank.pagerank(pageID_rdd, links_rdd, 5)

print 'start get tf'
tf_rdd = (barrel_rdd
    .flatMap(lambda x: x['words_with_meta'])
    .map(lambda (word, (page_id, is_title, tf)): (word_to_wordID[word], (page_id, tf)))
    .cache())

print 'start get df'
df_rdd = (barrel_rdd
    .flatMap(lambda x: x['unique_words'])
    .map(lambda word: (word_to_wordID[word], 1))
    .reduceByKey(add)
    .cache())

print 'start get tfidf'
tfidf_rdd = (tf_rdd
    .join(df_rdd)
    .map(lambda (word_id, ((page_id, tf), df)): ((word_id, page_id), round(tf / float(df), 5)))
    .cache())
def pagerank(self):
    return pagerank(self.tomatrix())
parsed_page_rdd = sc.textFile(load.PARSED_PAGE_PATH).map(util.encode)
parsed_page_rdd.cache()
docid_words = parsed_page_rdd.keys() \
    .map(lambda x: (x['url'], x['words'])) \
    .join(url_docid) \
    .map(lambda (url, (words, doc_id)): (doc_id, words))
links_rdd = parsed_page_rdd.values().flatMap(lambda x: x)
word_word_id = sc.textFile(load.WORD_LIST_PATH) \
    .map(util.encode) \
    .map(lambda x: (x['word'], x['word_id']))

print 'start links'
print 'start get pagerank'
ranks_rdd = pagerank.pagerank(doc_id_rdd, links_rdd, 5)

# Create tf-idf and the inverted index
MINIMUM_VALID_DF = 10
words_with_meta = docid_words.join(ranks_rdd) \
    .flatMap(lambda (doc_id, (words, rank)): [(word, (doc_id, meta, rank)) for (word, meta) in words]) \
    .join(word_word_id) \
    .map(lambda (word, ((doc_id, meta, rank), word_id)): ((word_id, word), (doc_id, meta, rank))) \
    .groupByKey() \
    .map(lambda (word_with_id, docs): (word_with_id, docs, len(docs))) \
    .filter(lambda (word_with_id, docs, df): df > MINIMUM_VALID_DF)
words_with_meta.cache()
words_with_meta.map(lambda ((word_id, word), docs, df): (word_id, [(doc_id, h, s, t, round(tf * df, 5), rank) for (doc_id, (h, s, t, tf), rank) in docs])) \
    .map(lambda (word_id, docs): (word_id, [{"doc_id": doc_id, "tfidf": tfidf, "header": h, "style": s, "title": t, "pagerank": r} for (doc_id, h, s, t, tfidf, r) in docs])) \
    .flatMap(lambda (word_id, docs): [(word_id, i, sub_docs) for i, sub_docs in enumerate(list(chunked(docs, int(math.ceil(len(docs) / float(DIVIDE_INDEX))))))]) \
import ast
import csv
import operator

from lematization import lemm_str
from pagerank import pagerank


def search(query: str, index_csv_path: str) -> [str]:
    lemms = lemm_str(query)
    doc_lists = []
    with open(index_csv_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for line in reader:
            for word in lemms:
                doc_lists.append(ast.literal_eval(line.get(word, '[]')))
    result = set()
    if len(doc_lists) > 0:
        result.update(set(doc_lists[0]))
    for docs in doc_lists:
        result.intersection_update(docs)
    return list(result)


if __name__ == "__main__":
    query = input("Enter your request:")
    answers = search(query, "./inverted_index.csv")
    for item in pagerank(answers):
        print(item)
from graph import Node, Graph
from pagerank import pagerank

a = Node()
b = Node()
c = Node()
d = Node()

g = Graph()
g.add_node('a', a)
g.add_node('b', b)
g.add_node('c', c)
g.add_node('d', d)

g.add_edge('b', 'c')
g.add_edge('b', 'a')
g.add_edge('c', 'a')
g.add_edge('d', 'a')
g.add_edge('d', 'b')
g.add_edge('d', 'c')

ranks = pagerank(g)
for node, value in ranks.items():
    print(node.name, value)
for line in f:
    line = line.strip().split(" ")
    n_0 = int(line[0])
    path_count = int(line[-1])
    start_nodes[n_0] += path_count
    total_number_of_items_manufactured += path_count
start_nodes = {k: v / total_number_of_items_manufactured for k, v in start_nodes.items()}

# CALCULATE PAGERANK FOR DIFFERENT CONDITIONS
# ===========================================
# Case 1: Original algorithm
E = {k: 1 for k, v in edges.items()}
start_nodes_all = {n: 1 / len(G.nodes) for n in G.nodes}
PR_1 = pagerank.pagerank(E, list(G.nodes), start_nodes_all, B=0.85)
# Case 2: Edge weights considered
PR_2 = pagerank.pagerank(edges, list(G.nodes), start_nodes_all, B=0.85)
# Case 3: Edge weights and start nodes considered
PR_3 = pagerank.pagerank(edges, list(G.nodes), start_nodes, B=0.85)

# Print table rows for LaTeX
for k, v in PR_1.items():
    print(k, " & ", round(v, 3), " & ", round(PR_2[k], 3), " & ", round(PR_3[k], 3), "\\\\")

# PLOT RESULTS
# ============
x = list(PR_1.keys())
sys.path.append('data/movie-actor/')
from make_casting_graph import oneway_to_bidirected_graph

g = oneway_to_bidirected_graph(graph)

from pagerank import pagerank, jypagerank

print()
print('--------------------------- TOP RESULT by PAGERANK ALGORITHM ---------------------------')
print()

start = time.time()
rank = pagerank(g, bias=None, df=0.15, max_iter=30, converge_error=0.001, verbose=1)

# Top-ranked movies: split scores by node type and filter for Korean movies
movie_rank = {node: rank for node, rank in rank.items() if node[0] == 'm'}
actor_rank = {node: rank for node, rank in rank.items() if node[0] == 'a'}
korean_movies = {
    movie: weight
    for movie, weight in movie_rank.items()
    if '한국)' in idx2movie(movie.split()[1])
}
list1 = []
import numpy as np

size = 100
h = np.arange(size * size).reshape(size, size).astype(np.float)
# Normalize so that every column sums to 1.0
h = h / h.sum(axis=0)

# Save the matrix to 3 chunk files
np.save('h_0_30.npy', h[0:30])
np.save('h_30_60.npy', h[30:60])
np.save('h_60_100.npy', h[60:100])

import pagerank

x = pagerank.pagerank('fn_chunk.txt', size)
print(x)
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

G, url_map, doc_id_map = crawler(input_directory)

# Drop graph nodes that do not correspond to any crawled document id.
to_remove = list()
for node in G.nodes():
    keep = False
    for doc_id, url_nb in doc_id_map.items():
        if node == url_nb:
            keep = True
    if not keep:
        to_remove.append(node)
G.remove_nodes_from(to_remove)

pr_result = pagerank(G)
# print(pr_result)
build_index(input_directory, output_file_dictionary, output_file_postings)
import csv

import networkx as nx

import pagerank

all_paper_list = list()
with open(r'..\result\paper5.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        all_paper_list.append(row[0])

graph = nx.DiGraph()
with open(r'..\g_result\all_information_2.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] in all_paper_list:
            if len(row) == 8:
                cite_paper_str = row[7]
                if cite_paper_str == '[]':
                    graph.add_node(row[0])
                else:
                    cite_paper_li = cite_paper_str[2:-2].split('\', \'')
                    for cite_paper in cite_paper_li:
                        graph.add_edge(cite_paper, row[0])
            else:
                graph.add_node(row[0])
print "graph create over"

rank_value_dic = pagerank.pagerank(graph, 1000)
file_path = r'..\result\paper_pagerank_value.csv'
pagerank.save_pagerank_value(rank_value_dic, file_path)