def __init__(self):
        myCrawler = Crawler(self.LINKS)
        crawledURLs = myCrawler.getVisited()
        linkStructure = myCrawler.getLinkStructure()
        print("Link-Struktur:\n")
        myCrawler.printLinkStructure()

        myPageRank = PageRank(linkStructure)
        pageRanks = myPageRank.getPageRank()
        print("\n\nPageRanks:\n")
        myPageRank.printPageRank()

        myIndex = Index(self.STOPWORDS, crawledURLs)
        index = myIndex.getIndex()
        print("\n\nIndex:\n")
        myIndex.printIndex()

        myScorer = Scorer(pageRanks, index, linkStructure)
        #myScorer.usePageRank(True)
        print("\n\nDokumentenlängen:\n")
        myScorer.printDocumentLengths()
        print("\n\nSuchergebnisse:\n")
        myScorer.calculateScores(["tokens"])
        myScorer.calculateScores(["index"])
        myScorer.calculateScores(["classification"])
        myScorer.calculateScores(["tokens", "classification"])
Example #2
def get_mr_job(iteration, threshold):
  '''Returns the MRJob for a given iteration.

  The first iteration uses the input file; every iteration after that
  uses the previous iteration's output, until the rank converges.

  PARAMETERS
  ----------
  iteration: int
    current PageRank iteration

  threshold: float
    Defines convergence threshold for rank

  RETURNS
  -------
  PageRank: MRJob
    PageRank job
  '''

  output = f'--output-dir={OUTDIR}{iteration}'
  # threshold is passed via the --rank-threshold option used elsewhere in these examples
  rank_threshold = f'--rank-threshold={threshold}'
  if not iteration:
    return PageRank([json_file, output, rank_threshold])
  else:
    input_dir = f'{OUTDIR}{iteration-1}'
    return PageRank([input_dir, output, rank_threshold])
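A minimal driver for get_mr_job might look like the sketch below. It assumes the same module defines OUTDIR and json_file, and has_converged() is a hypothetical helper standing in for whatever convergence check the original project uses:

def run_page_rank(threshold=0.001, max_iterations=50):
    # Sketch: run the PageRank MRJob for successive iterations until the rank converges.
    for iteration in range(max_iterations):
        job = get_mr_job(iteration, threshold)
        with job.make_runner() as runner:  # standard mrjob runner API
            runner.run()
        if has_converged(runner):  # hypothetical convergence check, not part of the original code
            break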
Example #3
    def test_mapper(self):
        mr_input = data['page_rank']['mapper_input']
        results = {}
        mr_job = PageRank(['./test_input.json', '--rank-threshold=0.1'])

        results = [(k, v) for input_key, input_val in mr_input.items()
                   for k, v in mr_job.mapper(input_key, input_val)]
        expectation = [(k, v) for k, v in data['page_rank']['mapper_output']]

        differences = [item for item in results if item not in expectation]

        self.assertEqual(len(differences), 0)
Example #4
    def page_rank_util(self):
        # print('Starting Page rank...........')
        pagerank = PageRank(self.random_walk, self.teleportation)
        final_steady_state = pagerank.get_final_steady_state()
        # print(final_steady_state)
        for i in range(len(self.images_list)):
            self.page_ranking[self.images_list[i]] = final_steady_state[i][0]

        # Ordering the page ranking based on the rank
        sorted_pagerank = sorted(self.page_ranking.items(),
                                 key=lambda kv: kv[1],
                                 reverse=True)
        self.page_ranking = dict(collections.OrderedDict(sorted_pagerank))
Example #5
    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")
Example #6
def big_graph(file_path):
    graph = PageGraph(file_path)
    graph.fetch_graph()
    page_ranker = PageRank(graph)
    page_ranker.rank("big")
    print("Top 50 pages sorted by PageRank:")
    page_ranker.sort_by_pr()
    print("Top 50 pages sorted by in-link count:")
    page_ranker.sort_by_inlink()
Example #7
class SearchEngine:
    """
    SearchEngine selects a search engine based on the user's choice and returns the scores of the query words.
    """
    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")

    def search(self, query, mode, limit=10):
        """Sends `process_text(query)` to the search engines selected by `mode` and returns article
            titles and associated scores up to `limited`. Results are sorted by their scores in
            a descending order.

        Args:
            query: raw query string
            mode: 'TF-IDF|PageRank|smart'
            limit: int

        Returns:
            A list of tuples. Each tuple is a document title and score pair.
        """
        keywords = process_text(query)  # process a raw query string to a cleaner version, remove
        # all the punctuations and white spaces
        if mode == 'TF-IDF':
            return self.tf_idf.search(keywords, limit)
        elif mode == 'PageRank':
            return self.page_rank.search(keywords, limit)
        elif mode == 'smart':
            return self.smart_search(keywords, limit)
        raise ValueError('Undefined search mode')

    def smart_search(self, keywords, limit=None):
        """
        Returns document scores for the query words, combining each word's TF-IDF score with the page's PageRank score.
        """
        smart_scores = {}
        tf_idf = self.tf_idf.tf_idf
        page_rank = self.page_rank.page_rank
        for word in keywords:
            if word in self.reverse_index:
                for page in self.reverse_index[word]:
                    if page not in smart_scores:
                        smart_scores[page] = 0
                    smart_scores[page] += tf_idf[word][page] + page_rank[page]
        result = sorted(smart_scores.items(), key=lambda x: x[1], reverse=True)
        return result[:limit]
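A rough usage sketch for the SearchEngine class above, assuming the project's build_corpus, TFIDF and PageRank modules are importable and with 'wiki_dump.xml' as a placeholder for a real dump file:

if __name__ == '__main__':
    engine = SearchEngine('wiki_dump.xml')  # placeholder path to the wiki dump
    # 'smart' mode combines the TF-IDF and PageRank scores as in smart_search()
    for title, score in engine.search('page rank algorithm', mode='smart', limit=5):
        print(title, score)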
Example #8
    def test_reducer(self):
        map_output = data['page_rank']['mapper_output']
        mr_input = defaultdict(list)

        for key, val in map_output:
            mr_input[key].append(val)

        results = {}
        mr_job = PageRank(['./test_input.json', '--rank-threshold=0.1'])

        results = [(k, v) for map_key, map_val in mr_input.items()
                   for k, v in mr_job.reducer(map_key, map_val)]
        expectation = [(k, tuple(v))
                       for k, v in data['page_rank']['reducer_output']]

        differences = [item for item in results if item not in expectation]

        self.assertEqual(len(differences), 0)
Example #9
def run(edge_file, node_num, beta=0.85, epsilon=1e-6, max_iterations=20):
    """Calls various ranking functions and print the rank_vectors.
	
	
	Parameters
	----------
	edge_file : string
		Path to the file where edges of web-graph are stored.

	node_num : int
		Number of nodes in the web-graph.
	
	beta : float, optional
		Probability with which teleports will occur.
		Default value : 0.85
	
	epsilon : float, optional
		A small value and total error in ranks should be less than epsilon.
		Default value : 1e-6
	
	max_iterations : int, optional
		Maximum number of times to apply power iteration.
		Default value : 20

	
	Returns
	-------
	None

	"""
    gg = getGraph(edge_file)
    edges = gg.get_connections()

    print("got edges...")

    pr = PageRank(beta, edges, epsilon, max_iterations, node_num)
    PageRank_vector = pr.pageRank()
    print(PageRank_vector, sum(PageRank_vector))

    tr = TrustRank(beta, edges, epsilon, max_iterations, node_num,
                   PageRank_vector)
    TrustRank_vector = tr.trustRank()
    print(TrustRank_vector, sum(TrustRank_vector))
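A call to run() with the documented defaults might then look like this sketch; the edge file name and node count are placeholders:

if __name__ == '__main__':
    # Placeholder graph file and node count; beta, epsilon and max_iterations keep the defaults above.
    run('web_graph_edges.txt', node_num=100000)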
Example #10
def big_graph(file_path):
    graph = PageGraph(file_path)
    graph.fetch_graph()
    page_ranker = PageRank(graph)
    page_ranker.rank("big")
    print("Top 50 pages sorted by PageRank:")
    page_ranker.sort_by_pr()
    print("Top 50 pages sorted by in-link count:")
    page_ranker.sort_by_inlink()
Example #11
def main():

	crawler = Crawler([
		"http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html",
		"http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html",
		"http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html"
	])
	crawler.crawl()

	bank = crawler.get_bank()
	bank.sortBank()

	print '\nLinkstruktur: \n' 
	bank.printOutgoing()

	print '\nPageRanks:'
	rank = PageRank(bank, 0.95, 0.04)
	rank.calculate()

	print '\n\nIndex: \n'
	i = Index( bank )
	i.printIndex()

	s = Scorer( 'tokens', i )
	
	print '\nDokumentenlaenge: \n'
	s.printDocumentLength()
	
	print '\nSuchergebnisse: \n'
	s.printScoring()
	s = Scorer( 'index', i )
	s.printScoring()
	s = Scorer( 'classification', i )
	s.printScoring()
	s = Scorer( 'tokens classification', i )
	s.printScoring()
Example #12
def test_graph_score():
    g = Graph()
    g.insert_edge(0, 1)
    g.insert_edge(0, 4)
    g.insert_edge(1, 2)
    g.insert_edge(1, 3)
    g.insert_edge(2, 0)
    g.insert_edge(3, 2)
    g.save_graph()
    p = PageRank(g)
    p.iterate(max_iter=None)
    p.save_score()

    return True
Example #13
def main(content_path='Contents', graph_path='Data/graph.p', bir_name=None):
    if not bir_name:
        # Create BIR
        bir = BIR(normalization_factor=2)

    else:
        with open("{}/{}".format('Data', bir_name), 'rb') as f:
            bir = p.load(f)
    n_file = 0
    for filename in os.listdir(content_path):
        if not filename.startswith('.'):
            idx = int(filename[:-4])  # Content file must be stored as doc_id.txt
            with open("{}/{}".format(content_path, filename), 'r') as f:
                if n_file % 100 == 0:
                    print("Have Parsed {} documents".format(n_file))
                kw = extract_keywords(f.read())
                bir.insert_document(doc=kw, idx=idx)
                n_file += 1
    # Calculate and save tf-idf table
    print("Create BIR and tf-idf for all documents...")
    bir.create_and_save_tf_idf(filename='tf_idx.p', path=os.getcwd())
    # Save the BIR to a pickle file
    print("Saving Inverted Index Table...")
    bir.save(path=os.getcwd())

    # Upload the web graph
    with open(graph_path, 'rb') as f:
        graph = p.load(f)

    print("Calculate PageRank ...")
    # Build PageRank using the web graph
    pagerank = PageRank(graph, prev_path=None, damping_factor=0.32, epsilon=0.0000001, default_weight=None)
    # Iterate until converge
    pagerank.iterate(max_iter=100000)

    # Save the PageRank score
    pagerank.save_score(filename=None, path=os.getcwd())
    print("Finished!!!")

    return True
Example #14
from MakeDataSimple import MakeDataSimple
from PageRank import PageRank
from SortedPageTitleByPageRank import *
import time

if __name__ == '__main__':
    start = time.time()
    page_links_reader = ReadPageLinksFile('./viwiki-20170901-pagelinks.sql')
    page_title_reader = ReadPageTitleFile('./viwiki-20170901-page.sql')
    make_data_simple = MakeDataSimple()

    print("-----Start Page Title Reader-----")
    page_title_reader.start()
    n_page = page_title_reader.get_total_field()

    print("-----Start Page Links Reader-----")
    page_links_reader.start()

    print("-----Make Data Simple-----")
    make_data_simple.start()

    print("-----Calculate Page Rank------")
    page_rank = PageRank(n_page=n_page, max_iterator=100, n_thread=6)
    page_rank.start()

    print("-----Sort page title by page rank------")
    write_sorted_page_title_by_page_rank()

    print("n_page {}".format(n_page))
    print("Total time: {}".format(time.time() - start))
Example #15
from Database import WikiDb
from PageRank import PageRank
import math

print "Added db init"


def weight_function(frequency, tf, N):
    return math.log(float(N) / frequency + 1) * tf
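# For intuition: with N = 1000 articles, a term with frequency = 10 and tf = 3
# gets weight log(1000/10 + 1) * 3 = log(101) * 3 ≈ 13.8 (natural log).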


article_json = [
    ('wiki_json/female_explorers.json', 'Female Explorers'),
    ('wiki_json/women_nobel_laureates.json', 'Women Nobel Laureates'),
    ('wiki_json/women_computer_scientists.json', 'Women Computer Scientists'),
    ('wiki_json/women_company_founders.json', 'Women Company Founders'),
    ('wiki_json/women_prime_ministers.json', 'Women Prime Ministers'),
]

article_db = WikiDb(article_json)
lookup_table = PageRank(weight_function,
                        len(article_db.article_id_to_metadata))

print "populating database"
for entry in article_db.db_entries():
    lookup_table.populate(article_db.get_article_content_by_id(entry), entry)
print "populated database"

# Now you can query article_db and lookup_table using the standard APIs.
Example #16
from sklearn.metrics import cohen_kappa_score
from costcla.models import CostSensitiveDecisionTreeClassifier

CONFIG_FILE = Commons.readConfigFile()
datasetFilePath = CONFIG_FILE["dataset-path"]
outputFilePath = CONFIG_FILE["output-path"]
hostGraphFileName = datasetFilePath + CONFIG_FILE["host-graph-file"]

NUM_NODES = 114529  # from analysis before
SPAM_LABEL = 0
NON_SPAM_LABEL = 1
'''
 1. Construct Graph From File
'''
graph = Commons.constructGraph(hostGraphFileName)
pr = PageRank(graph)
'''
 1. Run Page Rank
 2. Pickle Page-Rank Dictionary

ranks = pr.pageRank(None)
filename = outputFilePath + CONFIG_FILE["page-rank-file"]
# pr.savePageRanksToDisk(filename, ranks)
'''
'''
TRUST RANK
 1. Extract Seeds from Training File to be used as Preference Vector (1 if node is non-spam else 0)
    Normalization of this vector is done inside the page-rank computation
 2. Run Page Rank with Preference Vector = Trust Rank (with Dampening and Splitting)
 3. Pickle Trust Ranks
'''
Example #17
def rank():
    pg = PageRank("hollins.dat")
    write(pg.run(.85), "output1.txt")
    write(pg.run(.95), "output2.txt")
    write(pg.run(.5), "output3.txt")
Example #18
from PageRank import PageRank
import numpy as np

# The best value for alpha is 0.85
pagerank = PageRank(0.001, 0.85)

adjacency_matrix = pagerank.input_array()

# Exercise: question 1
adjacency_matrix = np.array([[0., 1., 0., 0., 0., 1.],
                           [0., 0., 1., 0., 0., 1.],
                           [0., 0., 0., 0., 1., 0.],
                           [0., 1., 0., 0., 1., 0.],
                           [0., 0., 0., 0., 0., 0.],
                           [0., 0., 1., 1., 0., 0.]])

adjacency_matrix_T = pagerank.Transpose_Matrix(adjacency_matrix)
print(adjacency_matrix_T)

Convert_to_markov = pagerank.Spars_Matrix(adjacency_matrix_T)
print(Convert_to_markov)

Sparse_matrix = pagerank.Spars_Matrix(Convert_to_markov)

print(Sparse_matrix)


v = np.zeros(adjacency_matrix_T.shape[0]).reshape(adjacency_matrix_T.shape[0], -1)
v_sparce = pagerank.Spars_Matrix(v)

e = np.ones(adjacency_matrix_T.shape[0]).reshape(adjacency_matrix_T.shape[0], -1)
Example #19
def small_graph(file_path):
    graph = PageGraph(file_path)
    graph.fetch_graph()
    page_ranker = PageRank(graph)
    page_ranker.rank("small")
Example #20
    print "Loading ",
    with open("Wiki-Vote.txt", mode='r') as data_file:
        for line in data_file:
            if line[0] == '#':
                continue
            line = line.replace('\n', '')
            ij_list = line.split('\t')
            i = index_map.getIndex(int(ij_list[0]))
            j = index_map.getIndex(int(ij_list[1]))
            A_t[j, i] = 1
            A[i, j] = 1
    print "is finished."

    print " [1] : SCC .............................................. "
    scc = SCC(A)
    A_temp = scc.removeDeadEnds()
    print "> PageRank started..."
    pr = PageRank(beta=0.8, max_err=0.0001)
    pr.initTransposedMat(A_temp.transpose())
    pr.normalize()
    iter_count = pr.run()
    print "PageRank finished."
    print "> Propagate scores of PageRank"
    v = scc.computeRanks(pr.v)
    indexes = np.array(v).argsort()[-10:][::-1]
    print "Best nodes:"
    for index in indexes:
        print '\t', index_map.nodes[index], '\t', v[index]
    print " [1]; ................................................... "
    # print ""
Example #21
    line = line.replace('\n', '').split(',')
    if line[0] == 'id':
        continue
    G.node[line[0]]["cluster"] = int(line[1])
cluster_file.close()
"""
Parameters
"""
taxation = 0.2
tol = 1e-5

# path to twitter graph file
graph = "twitter_combined.txt"

## PageRank
pg = PageRank(graph)
pg_value = pg.basic_pagerank(taxation, tol)
t_pg_value = pg.tensor_pagerank(topic, taxation, tol)

## HITS
hits = HITS(graph)
hits_h_value, hits_a_value = hits.basic_hits(tol)
t_hits_h_value, t_hits_a_value = hits.tensor_hits(topic, taxation, tol)

## Find the top influential nodes based on the different ranking methods
pg_value_rank = np.argsort(-pg_value)
t_pg_value_rank = []
for i in range(len(t_pg_value)):
    t_pg_value_rank.append(np.argsort(-t_pg_value[i]))

hits_h_value_rank = np.argsort(-hits_h_value)
Example #22
    )

    print(
        "Since there is no pre-crawled data, do you want to crawl from the beginning ?"
    )
    choice = input(
        "CAUTION!!! Crawling can run for hours! Your choice ? (y/n) :")

    if choice == 'y':
        baseurl = input(
            "Enter the start web-page to initiate crawl from (eg. https://www.cs.uic.edu) : "
        )
        maxP = int(
            input(
                "Enter the max number of pages you want to try downloading: "))
        pagerank = PageRank()
        spider = NoogleSpider(baseurl, pagerank, maxPages=maxP)
        spider.crawl()

        print(
            "Pickling the web structure as 'prankNoScores' file for future pagerank calculation..."
        )
        with open('prankNoScores', 'wb') as outf:
            pickle.dump(pagerank, outf)
    else:
        exit()

actPages = set(pagerank.adjList.keys())
tPages = pagerank.pages & actPages

flag = input("Do you want to run the pagerank iteration ? (y/n): ")
Example #23
            # generate outputs to hdfs
            temp = total_deg_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path + 'total_degree')

        if graph_statistics.getTotalDeg_vs_Count():
            output_rdd = deg.statistics_compute(D, 'total')
            deg_vs_count_rdd = deg.deg_vs_count(output_rdd)

            # generate outputs to hdfs
            temp = deg_vs_count_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path + 'deg_vs_count')
        '''
        PageRank
        '''
        pr = PageRank()

        if graph_statistics.getPR():
            pr_rdd = pr.statistics_compute(D, Iter, 0.85, debug_mod)

            # generate outputs to hdfs
            temp = pr_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path + 'pagerank')

        if graph_statistics.getPR_vs_Count():
            pr_rdd = pr.statistics_compute(D, Iter, 0.85, debug_mod)
            [centers, counts] = pr.pr_vs_count(pr_rdd, N)
            centers = sc.parallelize(centers)
            counts = sc.parallelize(counts)
            pr_vs_count = centers.zip(counts)
Example #24
from PageRank import PageRank
from MyWoosh import MyWoosh
from collections import Counter

searchQuery = "red"
mixedResult = Counter()
pg = PageRank("aula04_links.txt", 0.1)
mw = MyWoosh("aula03_cfc.txt")

#mw.createIndex()
print "Converged in ", pg.runUntilConvergence(), "Iterations"

search = mw.searchWord(searchQuery)
for doc in search.viewkeys():
    mixedResult += Counter({doc : search[doc] * pg.getScoreOfDocument(doc)*pg.numVertices})

print mixedResult.most_common(5)
Example #25
from PageRank import PageRank


nodes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
edges = [
	('A', 'B'),
	('A', 'C'),
	('B', 'C'),
	('B', 'D'),
	('C', 'D'),
	('D', 'C'),
	('E', 'D'),
	('F', 'D'),
	('E', 'F'),
	('F', 'E'),
	('G', 'A'),
	('A', 'G'),
	('C', 'G'),
	('B', 'G'),
]
pagerank = PageRank(nodes, edges)

for node, rank in pagerank.ranking():
    print str((node,rank))
Example #26
from IRModel import Vectoriel, ModeleLangue, Okapi
from PageRank import PageRank

parser = Parser()
parser.buildDocCollectionSimple(r"data\cisi\cisi.txt")

docs = parser.getListDocs()

#creation des index
indexSimpler = IndexSimpler(parser.getCollection())
indexSimpler.indexation()
indexSimpler.indexation_tf_idf()
indexSimpler.indexationHyperLinks()

#chargement des requetes
queryParser = QueryParser()
queryCollection = queryParser.buildCollectionQuery(r"data\cisi\cisi.qry",
                                                   r"data\cisi\cisi.rel")

query = queryCollection[1]

weighter1 = Weighter.Weighter1(indexSimpler)

vectoriel = Vectoriel(indexSimpler, weighter1, normalized=True)

#------------------------------------test du Page Rank---------------------

pageRank = PageRank(vectoriel, weighter1, n=5, k=3, d=0.85)
listDocs = pageRank.get_scores(query.getText(), max_iter=100)
print("Page rank: liste des documents avec leur score : ", listDocs[:20])
Example #27
            
            # generate outputs to hdfs
            temp = output_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path+'in_degree')
            
        if graph_statistics.getTotaldge():
            output_rdd = deg.statistics_compute(D, 'total')
            
            # generate outputs to hdfs
            temp = output_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path+'total_degree')
            
        '''
        PageRank
        '''       
        pr = PageRank() 
        
        if graph_statistics.getPR():
            output_rdd = pr.statistics_compute(D, 19, 0.85, debug_mod)
            
            # generate outputs to hdfs
            temp = output_rdd.map(ut.toTSVLine).coalesce(1)
            temp.saveAsTextFile(output_file_path+'pagerank')
      

    elif graph_statistics.isWeighted() == 1:
        '''
        Degrees
        '''
        deg = Degrees()
        if graph_statistics.getOutdeg():
Example #28
        'id': 4,
        'pageRankScore': 0.0,
        'tempPageRankScore': 0.0,
        'edgeOut': [],
        'edgeIn': []
    }

    addNode(nodeA)
    addNode(nodeB)
    addNode(nodeC)
    addNode(nodeD)
    addNode(nodeE)

    addEdge(nodeA, nodeB)
    addEdge(nodeA, nodeC)

    addEdge(nodeB, nodeC)
    addEdge(nodeB, nodeD)

    addEdge(nodeC, nodeA)
    addEdge(nodeC, nodeE)

    addEdge(nodeE, nodeC)

    return graph


p = PageRank(constructGraph())
p.runPageRank()
pp(p.graph)