def __init__(self):
    myCrawler = Crawler(self.LINKS)
    crawledURLs = myCrawler.getVisited()
    linkStructure = myCrawler.getLinkStructure()
    print("Link-Struktur:\n")
    myCrawler.printLinkStructure()
    myPageRank = PageRank(linkStructure)
    pageRanks = myPageRank.getPageRank()
    print("\n\nPageRanks:\n")
    myPageRank.printPageRank()
    myIndex = Index(self.STOPWORDS, crawledURLs)
    index = myIndex.getIndex()
    print("\n\nIndex:\n")
    myIndex.printIndex()
    myScorer = Scorer(pageRanks, index, linkStructure)
    # myScorer.usePageRank(True)
    print("\n\nDokumentenlängen:\n")
    myScorer.printDocumentLengths()
    print("\n\nSuchergebnisse:\n")
    myScorer.calculateScores(["tokens"])
    myScorer.calculateScores(["index"])
    myScorer.calculateScores(["classification"])
    myScorer.calculateScores(["tokens", "classification"])

def get_mr_job(iteration, threshold):
    '''Returns an MRJob for the given iteration.

    The first iteration uses the input file; every later iteration uses
    the previous iteration's output, until the rank converges.

    PARAMETERS
    ----------
    iteration: int
        current PageRank iteration
    threshold: float
        convergence threshold for the rank

    RETURNS
    -------
    PageRank: MRJob
        PageRank job
    '''
    output = f'--output-dir={OUTDIR}{iteration}'
    rank_threshold = f'--rank-threshold={threshold}'
    if not iteration:
        return PageRank([json_file, output, rank_threshold])
    else:
        input_dir = f'{OUTDIR}{iteration - 1}'
        return PageRank([input_dir, output, rank_threshold])

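# The docstring above describes an iteration loop that is not shown in this
# snippet. A minimal driver sketch, assuming get_mr_job() as defined above and
# mrjob's standard make_runner() API; MAX_ITERATIONS is a placeholder cap, and
# a real driver would also check the convergence condition between passes.
MAX_ITERATIONS = 10

for iteration in range(MAX_ITERATIONS):
    job = get_mr_job(iteration, threshold=0.1)
    with job.make_runner() as runner:  # standard mrjob runner context
        runner.run()
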
def test_mapper(self):
    mr_input = data['page_rank']['mapper_input']
    mr_job = PageRank(['./test_input.json', '--rank-threshold=0.1'])
    results = [(k, v)
               for input_key, input_val in mr_input.items()
               for k, v in mr_job.mapper(input_key, input_val)]
    expectation = [(k, v) for k, v in data['page_rank']['mapper_output']]
    differences = [item for item in results if item not in expectation]
    self.assertEqual(len(differences), 0)

def page_rank_util(self):
    # print('Starting Page rank...........')
    pagerank = PageRank(self.random_walk, self.teleportation)
    final_steady_state = pagerank.get_final_steady_state()
    # print(final_steady_state)
    for i in range(len(self.images_list)):
        self.page_ranking[self.images_list[i]] = final_steady_state[i][0]
    # Order the page ranking by rank, highest first
    sorted_pagerank = sorted(self.page_ranking.items(), key=lambda kv: kv[1], reverse=True)
    self.page_ranking = dict(collections.OrderedDict(sorted_pagerank))

def big_graph(file_path):
    graph = PageGraph(file_path)
    graph.fetch_graph()
    page_ranker = PageRank(graph)
    page_ranker.rank("big")
    print("Top 50 pages sorted by PageRank:")
    page_ranker.sort_by_pr()
    print("Top 50 pages sorted by in-link count:")
    page_ranker.sort_by_inlink()

class SearchEngine:
    """
    SearchEngine selects a particular search engine based on the user's
    choice and returns the scores of the query words.
    """

    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF.

        Args:
            file_name: path to the xml files of a wiki dump
        """
        # build the corpus from the xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")

    def search(self, query, mode, limit=10):
        """Sends `process_text(query)` to the search engine selected by `mode`
        and returns up to `limit` article titles with their associated scores.
        Results are sorted by score in descending order.

        Args:
            query: raw query string
            mode: 'TF-IDF' | 'PageRank' | 'smart'
            limit: int

        Returns:
            A list of tuples. Each tuple is a document title and score pair.
        """
        # process the raw query string into a cleaner version,
        # removing all punctuation and whitespace
        keywords = process_text(query)
        if mode == 'TF-IDF':
            return self.tf_idf.search(keywords, limit)
        elif mode == 'PageRank':
            return self.page_rank.search(keywords, limit)
        elif mode == 'smart':
            return self.smart_search(keywords, limit)
        raise ValueError('Undefined search mode')

    def smart_search(self, keywords, limit=None):
        """Scores each document for the query words by combining its
        TF-IDF score and its PageRank score.
        """
        smart_scores = {}
        tf_idf = self.tf_idf.tf_idf
        page_rank = self.page_rank.page_rank
        for word in keywords:
            if word in self.reverse_index:
                for page in self.reverse_index[word]:
                    if page not in smart_scores:
                        smart_scores[page] = 0
                    smart_scores[page] += tf_idf[word][page] + page_rank[page]
        result = sorted(smart_scores.items(), key=lambda x: x[1], reverse=True)
        return result[:limit]

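# Hypothetical usage sketch of the class above, not from the original project:
# the dump path and the query string are placeholders.
engine = SearchEngine('wiki_dump.xml')
for title, score in engine.search('graph algorithms', mode='smart', limit=5):
    print(title, score)
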
def test_reducer(self):
    map_output = data['page_rank']['mapper_output']
    mr_input = defaultdict(list)
    for key, val in map_output:
        mr_input[key].append(val)
    mr_job = PageRank(['./test_input.json', '--rank-threshold=0.1'])
    results = [(k, v)
               for map_key, map_val in mr_input.items()
               for k, v in mr_job.reducer(map_key, map_val)]
    expectation = [(k, tuple(v)) for k, v in data['page_rank']['reducer_output']]
    differences = [item for item in results if item not in expectation]
    self.assertEqual(len(differences), 0)

def run(edge_file, node_num, beta=0.85, epsilon=1e-6, max_iterations=20):
    """Calls the ranking functions and prints the rank vectors.

    Parameters
    ----------
    edge_file : string
        Path to the file where the edges of the web graph are stored.
    node_num : int
        Number of nodes in the web graph.
    beta : float, optional
        Probability with which teleports will occur. Default value: 0.85
    epsilon : float, optional
        A small value; the total error in the ranks should be less than epsilon.
        Default value: 1e-6
    max_iterations : int, optional
        Maximum number of times to apply power iteration. Default value: 20

    Returns
    -------
    None
    """
    gg = getGraph(edge_file)
    edges = gg.get_connections()
    print("got edges...")
    pr = PageRank(beta, edges, epsilon, max_iterations, node_num)
    PageRank_vector = pr.pageRank()
    print(PageRank_vector, sum(PageRank_vector))
    tr = TrustRank(beta, edges, epsilon, max_iterations, node_num, PageRank_vector)
    TrustRank_vector = tr.trustRank()
    print(TrustRank_vector, sum(TrustRank_vector))

def main():
    crawler = Crawler([
        "http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html",
        "http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html",
        "http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html"
    ])
    crawler.crawl()
    bank = crawler.get_bank()
    bank.sortBank()
    print '\nLinkstruktur: \n'
    bank.printOutgoing()
    print '\nPageRanks:'
    rank = PageRank(bank, 0.95, 0.04)
    rank.calculate()
    print '\n\nIndex: \n'
    i = Index(bank)
    i.printIndex()
    s = Scorer('tokens', i)
    print '\nDokumentenlaenge: \n'
    s.printDocumentLength()
    print '\nSuchergebnisse: \n'
    s.printScoring()
    s = Scorer('index', i)
    s.printScoring()
    s = Scorer('classification', i)
    s.printScoring()
    s = Scorer('tokens classification', i)
    s.printScoring()

def test_graph_score():
    g = Graph()
    g.insert_edge(0, 1)
    g.insert_edge(0, 4)
    g.insert_edge(1, 2)
    g.insert_edge(1, 3)
    g.insert_edge(2, 0)
    g.insert_edge(3, 2)
    g.save_graph()
    p = PageRank(g)
    p.iterate(max_iter=None)
    p.save_score()
    return True

def main(content_path='Contents', graph_path='Data/graph.p', bir_name=None):
    if not bir_name:
        # Create a new BIR
        bir = BIR(normalization_factor=2)
    else:
        with open("{}/{}".format('Data', bir_name), 'rb') as f:
            bir = p.load(f)
    n_file = 0
    for filename in os.listdir(content_path):
        if not filename.startswith('.'):
            idx = int(filename[:-4])  # content files must be stored as doc_id.txt
            with open("{}/{}".format(content_path, filename), 'r') as f:
                if n_file % 100 == 0:
                    print("Have Parsed {} documents".format(n_file))
                kw = extract_keywords(f.read())
                bir.insert_document(doc=kw, idx=idx)
                n_file += 1
    # Calculate and save the tf-idf table
    print("Create BIR and tf-idf for all documents...")
    bir.create_and_save_tf_idf(filename='tf_idx.p', path=os.getcwd())
    # Save the BIR to a pickle file
    print("Saving Inverted Index Table...")
    bir.save(path=os.getcwd())
    # Load the web graph
    with open(graph_path, 'rb') as f:
        graph = p.load(f)
    print("Calculate PageRank ...")
    # Build PageRank using the web graph
    pagerank = PageRank(graph, prev_path=None, damping_factor=0.32,
                        epsilon=0.0000001, default_weight=None)
    # Iterate until convergence
    pagerank.iterate(max_iter=100000)
    # Save the PageRank scores
    pagerank.save_score(filename=None, path=os.getcwd())
    print("Finished!!!")
    return True

from MakeDataSimple import MakeDataSimple
from PageRank import PageRank
from SortedPageTitleByPageRank import *
import time

if __name__ == '__main__':
    start = time.time()
    page_links_reader = ReadPageLinksFile('./viwiki-20170901-pagelinks.sql')
    page_title_reader = ReadPageTitleFile('./viwiki-20170901-page.sql')
    make_data_simple = MakeDataSimple()
    print("-----Start Page Title Reader-----")
    page_title_reader.start()
    n_page = page_title_reader.get_total_field()
    print("-----Start Page Links Reader-----")
    page_links_reader.start()
    print("-----Make Data Simple-----")
    make_data_simple.start()
    print("-----Calculate Page Rank------")
    page_rank = PageRank(n_page=n_page, max_iterator=100, n_thread=6)
    page_rank.start()
    print("-----Sort page title by page rank------")
    write_sorted_page_title_by_page_rank()
    print("n_page {}".format(n_page))
    print("Total time: {}".format(time.time() - start))

from Database import WikiDb
from PageRank import PageRank
import math

print "Added db init"


def weight_function(frequency, tf, N):
    return math.log(float(N) / frequency + 1) * tf


article_json = [
    ('wiki_json/female_explorers.json', 'Female Explorers'),
    ('wiki_json/women_nobel_laureates.json', 'Women Nobel Laureates'),
    ('wiki_json/women_computer_scientists.json', 'Women Computer Scientists'),
    ('wiki_json/women_company_founders.json', 'Women Company Founders'),
    ('wiki_json/women_prime_ministers.json', 'Women Prime Ministers'),
]

article_db = WikiDb(article_json)
lookup_table = PageRank(weight_function, len(article_db.article_id_to_metadata))

print "populating database"
for entry in article_db.db_entries():
    lookup_table.populate(article_db.get_article_content_by_id(entry), entry)
print "populated database"
# Now you can query article_db and lookup_table using the standard APIs.

from sklearn.metrics import cohen_kappa_score
from costcla.models import CostSensitiveDecisionTreeClassifier

CONFIG_FILE = Commons.readConfigFile()
datasetFilePath = CONFIG_FILE["dataset-path"]
outputFilePath = CONFIG_FILE["output-path"]
hostGraphFileName = datasetFilePath + CONFIG_FILE["host-graph-file"]
NUM_NODES = 114529  # from earlier analysis
SPAM_LABEL = 0
NON_SPAM_LABEL = 1

'''
1. Construct Graph From File
'''
graph = Commons.constructGraph(hostGraphFileName)
pr = PageRank(graph)

'''
1. Run Page Rank
2. Pickle Page-Rank Dictionary

ranks = pr.pageRank(None)
filename = outputFilePath + CONFIG_FILE["page-rank-file"]
# pr.savePageRanksToDisk(filename, ranks)
'''

'''
TRUST RANK
1. Extract Seeds from Training File to be used as Preference Vector
   (1 if node is non-spam else 0).
   Normalization of this vector is done inside the page-rank computation.
2. Run Page Rank with Preference Vector = Trust Rank (with Dampening and Splitting)
3. Pickle Trust Ranks

def rank():
    pg = PageRank("hollins.dat")
    write(pg.run(.85), "output1.txt")
    write(pg.run(.95), "output2.txt")
    write(pg.run(.5), "output3.txt")

from PageRank import PageRank
import numpy as np

# the best value for alpha is 0.85
pagerank = PageRank(0.001, 0.85)
adjacency_matrix = pagerank.input_array()

# exercise, question 1
adjacency_matrix = np.array([[0., 1., 0., 0., 0., 1.],
                             [0., 0., 1., 0., 0., 1.],
                             [0., 0., 0., 0., 1., 0.],
                             [0., 1., 0., 0., 1., 0.],
                             [0., 0., 0., 0., 0., 0.],
                             [0., 0., 1., 1., 0., 0.]])
adjacency_matrix_T = pagerank.Transpose_Matrix(adjacency_matrix)
print(adjacency_matrix_T)
Convert_to_markov = pagerank.Spars_Matrix(adjacency_matrix_T)
print(Convert_to_markov)
Sparse_matrix = pagerank.Spars_Matrix(Convert_to_markov)
print(Sparse_matrix)
v = np.zeros(adjacency_matrix_T.shape[0]).reshape(adjacency_matrix_T.shape[0], -1)
v_sparce = pagerank.Spars_Matrix(v)
e = np.ones(adjacency_matrix_T.shape[0]).reshape(adjacency_matrix_T.shape[0], -1)

def small_graph(file_path):
    graph = PageGraph(file_path)
    graph.fetch_graph()
    page_ranker = PageRank(graph)
    page_ranker.rank("small")

print "Loading ", with open("Wiki-Vote.txt", mode='r') as data_file: for line in data_file: if line[0] == '#': continue line = line.replace('\n', '') ij_list = line.split('\t') i = index_map.getIndex(int(ij_list[0])) j = index_map.getIndex(int(ij_list[1])) A_t[j, i] = 1 A[i, j] = 1 print "is finished." print " [1] : SCC .............................................. " scc = SCC(A) A_temp = scc.removeDeadEnds() print "> PageRank started..." pr = PageRank(beta=0.8, max_err=0.0001) pr.initTransposedMat(A_temp.transpose()) pr.normalize() iter_count = pr.run() print "PageRank finished." print "> Propagate scores of PageRank" v = scc.computeRanks(pr.v) indexes = np.array(v).argsort()[-10:][::-1] print "Best nodes:" for index in indexes: print '\t', index_map.nodes[index], '\t', v[index] print " [1]; ................................................... " # print ""
    line = line.replace('\n', '').split(',')
    if line[0] == 'id':
        continue
    G.node[line[0]]["cluster"] = int(line[1])
cluster_file.close()

"""
Parameters
"""
taxation = 0.2
tol = 1e-5
# path to the twitter graph file
graph = "twitter_combined.txt"

## PageRank
pg = PageRank(graph)
pg_value = pg.basic_pagerank(taxation, tol)
t_pg_value = pg.tensor_pagerank(topic, taxation, tol)

## HITS
hits = HITS(graph)
hits_h_value, hits_a_value = hits.basic_hits(tol)
t_hits_h_value, t_hits_a_value = hits.tensor_hits(topic, taxation, tol)

## Find the top influential nodes based on the different measures
pg_value_rank = np.argsort(-pg_value)
t_pg_value_rank = []
for i in range(len(t_pg_value)):
    t_pg_value_rank.append(np.argsort(-t_pg_value[i]))
hits_h_value_rank = np.argsort(-hits_h_value)

)
print(
    "Since there is no pre-crawled data, do you want to crawl from the beginning ?"
)
choice = input(
    "CAUTION!!! Crawling can run for hours! Your choice ? (y/n) :")
if choice == 'y':
    baseurl = input(
        "Enter the start web-page to initiate crawl from (eg. https://www.cs.uic.edu) : "
    )
    maxP = int(
        input(
            "Enter the max number of pages you want to try downloading: "))
    pagerank = PageRank()
    spider = NoogleSpider(baseurl, pagerank, maxPages=maxP)
    spider.crawl()
    print(
        "Pickling the web structure as 'prankNoScores' file for future pagerank calculation..."
    )
    with open('prankNoScores', 'wb') as outf:
        pickle.dump(pagerank, outf)
else:
    exit()
actPages = set(pagerank.adjList.keys())
tPages = pagerank.pages & actPages
flag = input("Do you want to run the pagerank iteration ? (y/n): ")

    # generate outputs to hdfs
    temp = total_deg_rdd.map(ut.toTSVLine).coalesce(1)
    temp.saveAsTextFile(output_file_path + 'total_degree')

if graph_statistics.getTotalDeg_vs_Count():
    output_rdd = deg.statistics_compute(D, 'total')
    deg_vs_count_rdd = deg.deg_vs_count(output_rdd)
    # generate outputs to hdfs
    temp = deg_vs_count_rdd.map(ut.toTSVLine).coalesce(1)
    temp.saveAsTextFile(output_file_path + 'deg_vs_count')

'''
PageRank
'''
pr = PageRank()
if graph_statistics.getPR():
    pr_rdd = pr.statistics_compute(D, Iter, 0.85, debug_mod)
    # generate outputs to hdfs
    temp = pr_rdd.map(ut.toTSVLine).coalesce(1)
    temp.saveAsTextFile(output_file_path + 'pagerank')

if graph_statistics.getPR_vs_Count():
    pr_rdd = pr.statistics_compute(D, Iter, 0.85, debug_mod)
    [centers, counts] = pr.pr_vs_count(pr_rdd, N)
    centers = sc.parallelize(centers)
    counts = sc.parallelize(counts)
    pr_vs_count = centers.zip(counts)

from PageRank import PageRank
from MyWoosh import MyWoosh
from collections import Counter

searchQuery = "red"
mixedResult = Counter()
pg = PageRank("aula04_links.txt", 0.1)
mw = MyWoosh("aula03_cfc.txt")
# mw.createIndex()
print "Converged in ", pg.runUntilConvergence(), "Iterations"
search = mw.searchWord(searchQuery)
for doc in search.viewkeys():
    mixedResult += Counter({doc: search[doc] * pg.getScoreOfDocument(doc) * pg.numVertices})
print mixedResult.most_common(5)

from PageRank import PageRank

nodes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
edges = [
    ('A', 'B'), ('A', 'C'),
    ('B', 'C'), ('B', 'D'),
    ('C', 'D'), ('D', 'C'),
    ('E', 'D'), ('F', 'D'),
    ('E', 'F'), ('F', 'E'),
    ('G', 'A'), ('A', 'G'),
    ('C', 'G'), ('B', 'G'),
]

pagerank = PageRank(nodes, edges)
for node, rank in pagerank.ranking():
    print str((node, rank))

from IRModel import Vectoriel, ModeleLangue, Okapi
from PageRank import PageRank

parser = Parser()
parser.buildDocCollectionSimple("data\\cisi\\cisi.txt")
docs = parser.getListDocs()

# build the indexes
indexSimpler = IndexSimpler(parser.getCollection())
indexSimpler.indexation()
indexSimpler.indexation_tf_idf()
indexSimpler.indexationHyperLinks()

# load the queries
queryParser = QueryParser()
queryCollection = queryParser.buildCollectionQuery("data\\cisi\\cisi.qry", "data\\cisi\\cisi.rel")
query = queryCollection[1]

weighter1 = Weighter.Weighter1(indexSimpler)
vectoriel = Vectoriel(indexSimpler, weighter1, normalized=True)

# ------------------------------------ PageRank test ---------------------
pageRank = PageRank(vectoriel, weighter1, n=5, k=3, d=0.85)
listDocs = pageRank.get_scores(query.getText(), max_iter=100)
print("PageRank: list of documents with their scores: ", listDocs[:20])

        # generate outputs to hdfs
        temp = output_rdd.map(ut.toTSVLine).coalesce(1)
        temp.saveAsTextFile(output_file_path + 'in_degree')

    if graph_statistics.getTotaldge():
        output_rdd = deg.statistics_compute(D, 'total')
        # generate outputs to hdfs
        temp = output_rdd.map(ut.toTSVLine).coalesce(1)
        temp.saveAsTextFile(output_file_path + 'total_degree')

    '''
    PageRank
    '''
    pr = PageRank()
    if graph_statistics.getPR():
        output_rdd = pr.statistics_compute(D, 19, 0.85, debug_mod)
        # generate outputs to hdfs
        temp = output_rdd.map(ut.toTSVLine).coalesce(1)
        temp.saveAsTextFile(output_file_path + 'pagerank')

elif graph_statistics.isWeighted() == 1:
    '''
    Degrees
    '''
    deg = Degrees()
    if graph_statistics.getOutdeg():

        'id': 4,
        'pageRankScore': 0.0,
        'tempPageRankScore': 0.0,
        'edgeOut': [],
        'edgeIn': []
    }

    addNode(nodeA)
    addNode(nodeB)
    addNode(nodeC)
    addNode(nodeD)
    addNode(nodeE)

    addEdge(nodeA, nodeB)
    addEdge(nodeA, nodeC)
    addEdge(nodeB, nodeC)
    addEdge(nodeB, nodeD)
    addEdge(nodeC, nodeA)
    addEdge(nodeC, nodeE)
    addEdge(nodeE, nodeC)

    return graph


p = PageRank(constructGraph())
p.runPageRank()
pp(p.graph)