Example No. 1
def test_triche_pagerank(n=3000):
    k = 2  # add 2 pages that will cheat by linking to the last page
    G = core.construire_G(n)
    vec = np.real(pr.page_rank(G))
    G2 = np.zeros((n + k, n + k))
    G2[:n, :n] = G
    for i in range(k):
        G2[n + i, n - 1] = 1
    vec2 = np.real(pr.page_rank(G2))
    return vec, vec2
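A minimal usage sketch (an illustration, not part of the original example), assuming pr.page_rank returns one score per node indexed by position; it compares the score of the targeted page n-1 before and after the two cheating pages are added:

# Hypothetical check: how much do the two cheating pages boost page n-1?
vec, vec2 = test_triche_pagerank(n=3000)
print("score of page n-1 without cheaters:", vec[3000 - 1])
print("score of page n-1 with cheaters:   ", vec2[3000 - 1])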
Example No. 2
    def get_docid_score(self):

        page_rank_score = pagerank.page_rank(self._links_set)
        for docid in page_rank_score:
            self._docid_score[docid] = page_rank_score[docid]

        return self._docid_score
    def generate_page_ranks(self, links):
        """Generate page ranks of links and store in database. Return pageranks dictionary"""

        page_ranks = pr.page_rank(links.keys())

        # (Sai) Insert into page rank table in db
        for doc_id, pagerank in page_ranks.items():
            query_str = "INSERT INTO pageRanks VALUES(%d, %f);" % (doc_id,
                                                                   pagerank)
            self._cur.execute(query_str)
            self._db_conn.commit()

        # Pages in page ranks dictionary:
        pages_in_pr = set(page_ranks.keys())

        # Pages in document index:
        doc_index_pages = set()
        for elem in self._document_index:
            doc_index_pages.add(elem[0])

        # Pages in document index but not page ranks
        missing_pages = doc_index_pages - pages_in_pr

        # Add missing pages to page rank table with rank of 0:
        for page in missing_pages:
            query_str = "INSERT INTO pageRanks VALUES(%d, 0);" % page
            self._cur.execute(query_str)
            self._db_conn.commit()

        return page_ranks
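The string-formatted INSERT above is fragile and open to SQL injection; a parameterized variant is safer. A sketch, assuming a DB-API 2.0 cursor that uses %s placeholders (as MySQLdb/psycopg2 do):

# Hypothetical parameterized version of the INSERT loop above
for doc_id, pagerank in page_ranks.items():
    self._cur.execute("INSERT INTO pageRanks VALUES (%s, %s);", (doc_id, pagerank))
self._db_conn.commit()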
Example No. 4
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        threadqueue = []
        start = time.time()
        self._max_depth = depth
        for i in range(self.MAX_THREADS):
            thread = threading.Thread(target=self.crawler_thread,
                                      args=(i, depth, timeout))
            thread.start()
            threadqueue.append(thread)

        for i in range(self.MAX_THREADS):
            threadqueue[i].join()

        mid = time.time()
        self._invert_index()
        invert = time.time()
        self._scores = page_rank(self._links, depth)
        scoring = time.time()
        self._store_data()
        end = time.time()
        print "time spent crawling: %d" % (mid - start)
        print "time spent inverting: %d" % (invert - mid)
        print "time spent scoring: %d" % (scoring - invert)
        print "time spent storing: %d" % (end - scoring)
        print "num errors %d" % self.errors
        for word in sorted(self._doc_id_cache):
            #print word
            pass
        links = [(self._doc_cache[x], self._doc_cache[y], d)
                 for x, y, d in self._links]
        for word in sorted(links):
            if word[2] <= depth:
                #print word
                pass
Example No. 5
 def page_rank_to_DB(self):
     rankings = page_rank(self._links_cache)
     for doc_id in rankings:
         pageRankPost = {
             'doc_id': doc_id,
             'url_ranks': rankings[doc_id]
         }
         pageRankDB.insert_one(pageRankPost)
 def page_rank_calculation(self, iterations=20, initial_pr=0.85 ):
   if self.db_conn.cursor():
     self.cur.execute('SELECT * FROM Links;')
     data = self.cur.fetchall()
     ranked_list = pagerank.page_rank(data, iterations, initial_pr)
     for entry in ranked_list: #self._mock_next_doc_id 
       self.cur.execute( """INSERT OR REPLACE INTO PageRank (doc_id, rank)  VALUES('%s', '%s');""" %  ( entry,  ranked_list[entry]) )
     self.db_conn.commit()
Example No. 7
    def calculate_pagerank(self):

        ranks = page_rank(self._links)

        for doc_id, rank in ranks.iteritems():

            vals = (doc_id, rank)
            self._cursor.execute(
                "INSERT OR IGNORE  INTO pageRank VALUES (?, ?)", vals)
            self._conn.commit()
Example No. 8
    def insert_pagerank(self):
        """Insert the page ranking of the specific page accessed"""

        if len(self._links_cache) > 0:
            link_rankings = page_rank(self._links_cache)

            for doc_id, doc_rank in link_rankings.iteritems():
                self._db_cursor.execute(
                    "INSERT OR IGNORE INTO page_rank(doc_id, doc_rank) VALUES (%d, %f);" % (doc_id, doc_rank)
                )
    def generate_page_ranks(self, links):
        """Generate page ranks of links and store in database. Return pageranks dictionary"""
    
        page_ranks = pr.page_rank(links.keys())

        # (Sai) Insert into redis
        for doc_id, pagerank in page_ranks.items():
            redis_ret_val = self._r_conn.zadd("pageranks", str(doc_id), str(pagerank))

        return page_ranks
Example No. 10
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()
        self._max_depth = depth

        start = time.time()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()

                print "    url=" + str(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        mid = time.time()
        self._invert_index()
        self._scores = page_rank(self._links,
                                 max_depth=self._max_depth,
                                 num_iterations=20)
        self._store_data()
        end = time.time()

        print "time spent crawling: %d" % (mid - start)
        print "time spent storing: %d" % (end - mid)
Example No. 11
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()
        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                #print ("    url=" + repr(self._curr_url))

            except Exception as e:
                print(e)
                pass
            finally:
                if socket:
                    socket.close()

        # NEW in lab3: add the inverted_index to the db
        for i in self._inverted_index:
            l = []
            for j in self._inverted_index[i]:
                l.append(j)
            dic = {'words_id': i, 'doc_id': l}
            store('inverted_index', dic)

        # NEW FOR LAB3: ADD THE PAGERANK
        self._rank_page = page_rank(
            list(zip(self._from_doc_list, self._to_doc_list)))
        for i in self._rank_page:
            dic = {'doc_id': i, 'score': self._rank_page[i]}
            store('page_rank', dic)
Example No. 12
def doc_id_index(links, inverted_doc_id, desc):
    """
    Build doc_id_index
    """
    pr_results = page_rank(links)
    mongo_doc = [{"_id": doc_id,
                "pageRank": link_id,
                "url": inverted_doc_id[doc_id],
                "title": desc[doc_id]['title'],
                "description": desc[doc_id]['description']} for doc_id,link_id in pr_results.items()]
    write_records(mongo_doc, "csc326", "doc_id_index")
Example No. 13
def create_rows(dependencygroup, leadingpath):
    """Returns a list of rows for a dependencygroup."""
    converter = pagerank.DependenciesToLinkMatrix(dependencygroup.dependencies)
    matrix = converter.create_matrix()
    ranking = pagerank.page_rank(matrix)
    ids = [idx for idx in range(len(matrix))]
    filenames = [utils.prettify_path(converter.id_to_node_map[nid], leadingpath) for nid in ids]

    rowinfos = zip(filenames, ranking, ids, matrix)
    rowinfos.sort(key=lambda item: item[1])  # sort by ranking
    rowinfos.reverse()
    return rowinfos
Example No. 14
def create_rows(dependencygroup, leadingpath):
    """Returns a list of rows for a dependencygroup."""
    converter = pagerank.DependenciesToLinkMatrix(dependencygroup.dependencies)
    matrix = converter.create_matrix()
    ranking = pagerank.page_rank(matrix)
    ids = [idx for idx in range(len(matrix))]
    filenames = [utils.prettify_path(converter.id_to_node_map[nid], leadingpath)
                 for nid in ids]

    rowinfos = zip(filenames, ranking, ids, matrix)
    rowinfos.sort(key=lambda item: item[1]) #sort by ranking
    rowinfos.reverse()
    return rowinfos
Example No. 15
 def crawler_page_ranks(self):
     calculatedRanks = page_rank(self._links)
     # order by greatest page rank to least
     # create a list of tuples
     pageRanks = []
     for page in calculatedRanks:
         for doc_id, url in self.doc_ids:
             if doc_id == page:  # compare ids by value, not identity
                 pageRanks.append((url, calculatedRanks[page]))
     #sort the list by descending page ranks
     pageRanks.sort(key=lambda tup: tup[1], reverse=True)
     # store to database
     self.dbconnection.set('pageranks', pageRanks)
     return pageRanks
Example No. 16
    def update_database(self):

        con = sqlite3.connect('dbFile.db')
        cur = con.cursor()

        cur.execute("DELETE FROM PageRank_Doc")
        cur.execute("DELETE FROM Words")

        #returns dict of docid -> pagerank number
        page_rank_temp = page_rank(self.get_links())

        for key, value in page_rank_temp.iteritems():
            if key in self._doc_id_to_doc_title_cache:
                # print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(key) + ", " + self._id_doc_cache[key] + ", " + str(value) + ", " + self._doc_id_to_doc_title_cache[key] + ")"
                cur.execute(
                    "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                    (key, self._id_doc_cache[key], value,
                     self._doc_id_to_doc_title_cache[key]))
            else:
                # print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(key) + ", " + self._id_doc_cache[key] + ", " + str(value) + ", " + "Unknown" + ")"
                cur.execute(
                    "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                    (key, self._id_doc_cache[key], value, "Unknown Title"))

        for key, value in self._doc_id_cache.items():
            # if doc_id does not exist inside the database because it has no page rank score, save its page rank score as 0.
            if value not in page_rank_temp:
                if value in self._doc_id_to_doc_title_cache:
                    # print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(value) + ", " + str(key) + ", " + "0 " + ", " + self._doc_id_to_doc_title_cache[value] + ")"
                    cur.execute(
                        "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                        (value, key, 0,
                         self._doc_id_to_doc_title_cache[value]))
                else:
                    # print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(value) + ", " + str(key) + ", " + "0 " + ", " + "unknown" + ")"
                    cur.execute(
                        "INSERT OR IGNORE INTO PageRank_Doc VALUES (?, ?, ?, ?)",
                        (value, key, 0, "Unknown Title"))

        # key = word_id, value = word (string)
        for key, value in self._id_word_cache.items():
            for id in set(self._word_id_to_doc_id_cache[key]):
                print "INSERT OR IGNORE INTO PageRank_Doc VALUES(" + str(
                    key) + ", " + str(value) + ", " + str(id) + ")"
                cur.execute("INSERT OR IGNORE INTO Words VALUES (?, ?, ?)",
                            (key, value, id))

        con.commit()
        con.close()
Example No. 17
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen
            
            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = [ ]
                self._index_document(soup)
                self._add_words_to_document()
                print "    url="+repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
        print(self.links)
        self.pagerank = pagerank.page_rank(self.links, num_iterations=20, initial_pr=1.0)
       # thread.start_new_thread(self.pagerank_db_update,("1",self))
       # thread.start_new_thread(self.index_db_update,("1",self))

        thread.start_new_thread(update_everything,("1",self))
        self.pagerank_db_update()
        self.index_db_update()
Example No. 18
def backEnd_run(dep):
    # Crawl through the URLs provided in urls.txt
    crawler.crawl(depth=int(dep))

    # Retrieve Data needed for populating the SQL Tables
    doc_index = crawler.get_docs_cache()
    inverted_index = crawler.get_inverted_index()
    anchor_db = crawler.get_anchor_db()
    lexicon = crawler.get_lexicon()
    pg_rank = page_rank(crawler.get_links_queue())
    titles_list = crawler.get_title_cache()
    resolved_inverted_index = crawler.get_resovled_inverted_index()
    description = crawler.get_desc_cache()
    images = crawler.get_image_cache()

    return doc_index, titles_list, lexicon, anchor_db, pg_rank, inverted_index, description, images, resolved_inverted_index
Example No. 19
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):
            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                print "    url=" + repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        pageranks = page_rank(self.links)
        print "pagerank length: ", len(pageranks)
        print
        for page in pageranks.keys():
            if page in self._documents.keys():
                print self._documents[page].title, page, pageranks[page]
                self._documents[page].pagerank = pageranks[page]
Example No. 20
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                self._update_inverted_index()
                print "    url=" + repr(self._curr_url)

            except urllib2.URLError as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
        pr = pagerank.page_rank([(elem.from_doc.id, elem.to_doc.id)
                                 for elem in Links.select()])
        for doc_id, rank_score in pr.iteritems():
            print doc_id, rank_score
            Documents.update(page_rank=rank_score).where(
                Documents.id == doc_id).execute()
Example No. 21
def pagerank_score_degre_entrant(n=20):
    G = core.construire_G(n)
    vec = pr.page_rank(G)
    vec = [int(10000 * np.real(x)) / 10000 for x in vec]
    degres = np.sum(G, axis=1).astype(int)
    dic = {}
    for i in range(len(degres)):
        d = degres[i]
        if not d in dic:
            dic[d] = []
        dic[d].append(vec[i])
    dic = {d: np.average(dic[d]) for d in dic}
    X = np.array(list(dic))
    Y = np.array([dic[d] for d in X])
    i = np.argsort(X)
    plt.plot(X[i], Y[i], marker='o')
    plt.show()
Example No. 22
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        
        seen = set()

        while len(self._url_queue):
            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue
           
            seen.add(doc_id)  # mark this document as seen
            
            socket = None
            
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())
                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = [ ]
                
                self._index_document(soup)
                self._add_words_to_document()
                
            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
                    
        rank = None
        if self._link_db:
            rank = pr.page_rank(self._link_db)
        self.update_db(rank)
Example No. 23
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""

        seen = set()

        while len(self._url_queue):
            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None

            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())
                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []

                self._index_document(soup)
                self._add_words_to_document()

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        rank = None
        if (self._link_db):
            rank = pr.page_rank(self._link_db)
        self.update_db(rank)
Example No. 24
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = [ ]
                self._index_document(soup)
                self._add_words_to_document()
                self._update_inverted_index()
                print "    url="+repr(self._curr_url)

            except urllib2.URLError as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
        pr = pagerank.page_rank([(elem.from_doc.id, elem.to_doc.id) for elem in Links.select()])
        for doc_id, rank_score in pr.iteritems():
            print doc_id, rank_score
            Documents.update(page_rank=rank_score).where(Documents.id==doc_id).execute()
Example No. 25
    def get_page_rank(self):
        link_page_rank = page_rank(self._links)
        # inv_dict = dict((v,k) for k, v in self._links_dict.iteritems())
        # for l_id, rank in link_page_rank.iteritems():
        #     link_url = inv_dict[l_id]
        #     if link_url in self._doc_id_cache:
        #         doc_id = self._doc_id_cache[link_url]
        #         self._page_rank[doc_id] = rank
        # for d_url in self._doc_id_cache:
        #     doc_id = self._doc_id_cache[d_url]
        #     if doc_id not in self._page_rank:
        #         self._page_rank[doc_id] = 0
        for doc_id, rank in link_page_rank.iteritems():
            self._page_rank[doc_id] = rank

        for url, doc_id in self._doc_id_cache.iteritems():
            if doc_id not in self._page_rank:
                self._page_rank[doc_id] = 0
Example No. 26
def test_page_rank():
    test_crawler = crawler(None, "")
    
    # test values
    DOC_ID_A = 1
    DOC_ID_B = 2
    DOC_ID_C = 3
    DOC_ID_D = 4
    
    # Initialize crawler needed for the function call
    test_crawler.add_link(DOC_ID_A, DOC_ID_B)
    test_crawler.add_link(DOC_ID_B, DOC_ID_D)
    test_crawler.add_link(DOC_ID_D, DOC_ID_C)
    test_crawler.compute_page_rank()
    # Expected and actual result comparison
    expected_result = page_rank([(DOC_ID_A,DOC_ID_B), (DOC_ID_B, DOC_ID_D), (DOC_ID_D, DOC_ID_C)])
    actual_result = test_crawler._page_rank

    # If the two results equal return true
    if cmp(expected_result, actual_result) == 0:
        return True
    else:
        return False
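Note that cmp() exists only in Python 2; under Python 3 the same comparison is a plain equality test. A hypothetical equivalent of the final check, not the original code:

# Python 3 equivalent of the cmp(expected_result, actual_result) == 0 check
return expected_result == actual_result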
Example No. 27
 def update_page_rank(self):
     """
      Insert all link relations from the url_list file into the DB.
     :return:
     """
     rank_dict = page_rank(self._in_out_links)
     print self._in_out_links
     with self._db_conn:
         c = self._db_conn.cursor()
         # for every document that we have crawled
         for doc_id in self._doc_id_cache.values():
             # if there is a rank for this document
             if doc_id in rank_dict:
                 _rank = rank_dict[doc_id]
             # if there is no rank for this document, meaning nothing links to the page
             else:
                 _rank = 0
              # if there was no such entry for DocId, this will create the rank entry;
              # if there was already an entry for DocId, nothing will be updated
             c.execute("INSERT OR IGNORE INTO PageRank (DocId, rank) VALUES (?,?)",
                     (doc_id, _rank))
              # Here we make sure ranks are up to date even if they existed before
             c.execute("UPDATE PageRank SET Rank=? WHERE DocId=?",
                     (_rank, doc_id))
Example No. 28
    # Testing of the inverted and resolved indexes
    # just print out their values
    #document_index = bot.get_document_index()
    document_index = bot.get_document_index_dict()
    #print "\nDocument Index\n~~~~~~~~~~~~~~\n", document_index
    lexicon = bot.get_lexicon()
    #print "\nLexicon\n~~~~~~~\n", lexicon
    inverted_index = bot.get_inverted_index()
    #print "\nInverted Index\n~~~~~~~~~~~~~~\n", inverted_index
    resolved_inverted_index = bot.get_resolved_inverted_index()
    #print "\nResolved Index\n~~~~~~~~~~~~~~\n", resolved_inverted_index

    # LAB 3
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # compute page ranks of crawled pages
    pagerank_dict = page_rank(bot.get_list_of_links(), num_iterations=20)

    # make an upgraded resolved_inverted_index
    # maps word strings to ordered list of tuples
    # tuples = (url string, page title, page rank score)
    word_to_sorted_list_of_urls = {}
    for word, url_set in resolved_inverted_index.items():
        # combine each url with its page rank into a tuple
        # sort that list of tuples by page rank
        # store new sorted list of url tuples into new data structure
        newlist = []
        for a_url in url_set:
            a_url_id = bot.document_id(a_url)
            a_url_rank = pagerank_dict[a_url_id]
            a_url_title = document_index[a_url_id][0]
            newlist.append((a_url, a_url_title, a_url_rank))
Example No. 29
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen
            
            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = [ ]
                self._index_document(soup)
                self._add_words_to_document()
                print "    url="+repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()
        #print self._from_to_id
        #print self._index_cache
        z = open('title.txt', 'r+')
        for x, y in self._title_id_cache.iteritems():
            x = x.replace("u'", '')
            a = str(y) + ' , ' + x + '\n'
            z.write(a)
        z.close()

        g = open('dbwords.txt', 'r+')
        for x, y in self._index_cache.iteritems():
            b = str(x) + ',' + str(y) + '\n'
            g.write(b)
        g.close()

        h = open('dbdocs.txt', 'r+')
        s1 = ""
        for x, y in self._doc_id_cache.iteritems():
            if "u'" in repr(x):
                s1 = repr(x).replace("u'", " ", 1)
            if 'u"' in repr(x):
                s1 = repr(x).replace('u"', " ", 1)
            s2 = s1.replace("'", "")
            s3 = s2.replace('"', "")
            c = str(y) + ' , ' + s3 + '\n'
            h.write(c)
        h.close()
        pagerank_results = pagerank.page_rank(self._from_to_id)
        #print pagerank_results
        f = open('prresults.txt', 'r+')
        for x, y in pagerank_results.iteritems():
            d = str(x) + ',' + str(y) + '\n'
            f.write(d)
        f.close()
Example No. 30
from crawler import crawler
from pagerank import page_rank
import pprint

if __name__ == "__main__":
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)

    # print "bot.links: "
    # print bot.links

    pageranks = page_rank(
        bot.links)  #calculates the page rank score and stores the data on disk

    pprint.pprint(pageranks)
Example No. 31
 def _calculate_page_rank(self):
     return page_rank(self.links_by_doc_id)
Example No. 32
 def get_page_ranks(self):
     return dict(pagerank.page_rank(self._url_pairs))
Example No. 33
def main():
  '''
  The menu shown at the beginning of the Page Rank program. It asks whether the
  file is a csv or snap file, then the file name.
   
  Ex:
    > python lab3.py [-w]
    CSC 466: Lab 3 - Page Rank & Link Analysis
    Parse:
      1) csv
      2) snap
    (User enters 1 or 2)
    File name: (User enters file name here)
   
  There is an optional flag '-w' that is used for the Football csv. The program
  prints a progress line every 1000 lines (to show that it is still parsing) and,
  at the end of the page rank algorithm, prints the top 20 nodes and how long it
  took to calculate page rank.
   
  Note: -w doesn't quite work at the moment. Please ignore it for now.
  '''
  
  is_weighted = False      # Used for '-w' flag

  # Setting variable if '-w' is used
  if len(sys.argv) > 1:
    if sys.argv[1] == '-w':
      is_weighted = True

  # Menu
  print('CSC 466: Lab 3 - PageRank & Link Analysis')
  parse_menu = raw_input('Parse:\n' +
                         '1) csv\n' +
                         '2) snap\n'
                        )
  file_name = raw_input('File name: ')
  
  # PARSING - CSV Files
  # Note: The algorithm is the same, just parsing is different.
  if parse_menu == '1':
    print('Parsing/Creating Graph...')
    start = time.time()    # Tracking time
    
    # Parses a csv file and returns a tuple (list, dictionary, dictionary)
    if is_weighted == False:
      (nodes, out_degrees, in_degrees) = parser.parse_csv(file_name)
    else:
      (nodes, out_degrees, in_degrees) = parser.parse_weighted_csv(file_name)
      
    end = time.time()
    print('Parse/Graph Set-up Time: ' + str(end - start) + ' seconds')

    # Sets up page rank structures
    pagerank.set_up(nodes, out_degrees, in_degrees)

    # PAGE RANKING
    print('Page Ranking...')
    start = time.time()
    num_iters = pagerank.page_rank(0)  # Stores # of page rank iterations
    end = time.time()
    
    # Statistics
    print('Page Rank Time: ' + str(end-start) + ' seconds')
    print('Page Rank Iterations: ' + str(num_iters))

  # PARSING - SNAP Files
  elif parse_menu == '2':
    print('Parsing/Creating Graph...')
    start = time.time()    # Tracking time
    
    # Parses a SNAP file and returns a tuple (list, dictionary, dictionary)
    (nodes, out_degrees, in_degrees) = parser.parse_snap(file_name)
    
    end = time.time()
    print('Parse/Graph Set-up Time: ' + str(end - start) + ' seconds')

    # Sets up page rank structures
    pagerank.set_up(nodes, out_degrees, in_degrees)

    # PAGE RANKING
    print('Page Ranking...')
    start = time.time()
    num_iters = pagerank.page_rank(0)  # Stores # of page rank iterations
    end = time.time()
    
    # Statistics
    print('Page Rank Time: ' + str(end-start) + ' seconds')
    print('Page Rank Iterations: ' + str(num_iters))
  
  # Wrong input
  else:
    print('Invalid input - exiting')
Example No. 34
from crawler import crawler
from pagerank import page_rank

# Get crawler object and crawl on urls found in urls.txt
crawler = crawler(None, 'urls.txt')
crawler.crawl()

document_index = crawler.get_document_index()

# Run pagerank on the links generated by the crawler
pagerank = page_rank(crawler._links)

for doc_id, rank in sorted(pagerank.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    document = crawler._document_index[doc_id]
    print str(rank) + " : " + str(document[0]) + "\n"
Example No. 35
 def get_raw_page_rank(self):
     if len(self._url_pairs) > 0:
         self._page_rank = pagerank.page_rank(self._url_pairs)
     return self._page_rank
Example No. 36
    # call crawler with depth 1
    # this will populate our database with all relevant information
    bot = crawler(con, "urls.txt")
    bot.crawl(depth=1)

    
    # extract all links from database
    with con:    
    
        cur = con.cursor()    
        cur.execute("SELECT * FROM link")
        con.commit()
        links = cur.fetchall()

    # calculate page rank from the extracted links
    page_rank = page_rank(links)

    # convert page_rank dict into a list of tuples
    page_rank_tuple=page_rank.items()

    # update the document table in the database with page ranks
    with con:
        cur = con.cursor() 

        for x,y in page_rank_tuple:
            cur.execute("UPDATE document SET page_rank = ? WHERE id= ?", (float(y),int(x)))
            cur.execute("UPDATE url_list SET page_rank = ? WHERE doc_id= ?", (1000*float(y),int(x)))
        con.commit()

    # find max links    
    """
Example No. 37
 def compute_page_rank(self):
     self._page_rank = page_rank(self._page_rank_list)
Example No. 38
 def _insert_pagerank(self):
     """Insert generated score for each page or link to database PageRank"""
     if len(self._link_list) > 0:
         _ranked_list = pagerank.page_rank(self._link_list)
         self._db.put_pageranks(_ranked_list)
Example No. 39
 def get_page_ranks(self):
     return dict(pagerank.page_rank(self._url_pairs))
Example No. 40
 def insert_pagerank_to_db(self):
     """ Insert rankings of pages/documents to database"""
     if len(self._links_cache) > 0:
         rankings = page_rank(self._links_cache)
         for doc_id, doc_rank in rankings.iteritems():
             self._db_cursor.execute('INSERT INTO PageRank(doc_id, doc_rank) VALUES (%d, %f);' % (doc_id, doc_rank) )
import crawler
import pagerank
import redis
import json
import pprint
import os

bot = crawler.crawler(None, "urls.txt")
bot.crawl(depth=1)
page_rank = pagerank.page_rank(bot._links)
inverted_index = bot.get_inverted_index()
resolved_inverted_index = bot.get_resolved_inverted_index()
lexicon = bot._doc_id_cache
url_lexicon = bot._url_lexicon

# convert dictionary to json string data and convert all sets in values to list form
json_page_rank = json.dumps(dict(page_rank))
json_inverted_index = {k: list(v) for k, v in inverted_index.items()}
json_inverted_index = json.dumps(json_inverted_index)
json_resolved_inverted_index = {
    k: list(v)
    for k, v in resolved_inverted_index.items()
}
json_resolved_inverted_index = json.dumps(json_resolved_inverted_index)
json_lexicon = json.dumps(dict(lexicon))

# store all data to redis database
# redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)
# redis_db.set('page_rank',json_page_rank)
# redis_db.set('inverted_index',json_inverted_index)
# redis_db.set('resolved_inverted_index',json_resolved_inverted_index)
Example No. 42
    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as seen

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                # keep track of word location for current docId
                self._curr_wordIndex = 0
                self._index_document(soup, doc_id)
                self._add_words_to_document()
                self._add_first_p_to_document(soup)
                print "    url=" + repr(self._curr_url)

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        # After crawling, save all data to database

        # save lexicon data to persistent storage
        # lexicon: wordId (INTEGER PRIMARY KEY), word (TEXT)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS lexicon
                (wordId INTEGER PRIMARY KEY, word TEXT);
            ''')
        lexiconData = [(int(wordId), word)
                       for word, wordId in self._word_id_cache.items()]
        self.cur.executemany(
            ''' INSERT INTO lexicon VALUES (?,?)
            ''', lexiconData)
        self.db_conn.commit()

        # save documentId data to persistent storage
        # documentId: docId (INTEGER PRIMARY KEY), url (TEXT)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS documentId
                (docId INTEGER PRIMARY KEY, url TEXT);
            ''')
        documentIdData = [(int(docId), str(url))
                          for url, docId in self._doc_id_cache.items()]
        self.cur.executemany(
            ''' INSERT INTO documentId VALUES (?,?)
            ''', documentIdData)
        self.db_conn.commit()

        # save invertedId data to persistent storage
        # invertedId: wordId (INTEGER), docId (INTEGER)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS invertedId
                (wordId INTEGER, docId INTEGER);
            ''')
        invertedIdData = []
        for wordId in self._inverted_index.keys():
            for docId in self._inverted_index[wordId]:
                invertedIdData.append((int(wordId), int(docId)))
        self.cur.executemany(
            ''' INSERT INTO invertedId VALUES (?,?)
            ''', invertedIdData)
        self.db_conn.commit()

        # save pageRankScores data to persistent storage
        # pageRankScores: docId (INTEGER), score (REAL)
        pageRankScores = pagerank.page_rank(self._from_to_links)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS pageRankScores
                (docId INTEGER, score REAL);
            ''')
        pageRankScoresData = [(int(docId), float(score))
                              for docId, score in pageRankScores.items()]
        unscoredLinks = [(int(docId), float(0.0))
                         for docId in self._doc_id_cache.values()
                         if docId not in pageRankScores.keys()]
        self.cur.executemany(
            ''' INSERT INTO pageRankScores VALUES (?,?)
            ''', pageRankScoresData)
        self.cur.executemany(
            ''' INSERT INTO pageRankScores VALUES (?,?)
            ''', unscoredLinks)
        self.db_conn.commit()

        # save docTitle data to persistent storage
        # docTitle: docId (INTEGER), title (TEXT)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS docTitle
                (docId INTEGER, title TEXT);
            ''')
        docTitles = [(int(docId), str(title))
                     for docId, title in self._doc_title_cache.items()]
        self.cur.executemany(
            ''' INSERT INTO docTitle VALUES (?,?)
            ''', docTitles)
        self.db_conn.commit()

        # save docWordHits data to persistent storage
        # docWordHits: docId (INTEGER), wordId (INTEGER), fontSize (INTEGER), wordLocation (INTEGER)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS docWordHits
                (docId INTEGER, wordId INTEGER, fontSize INTEGER, wordLocation INTEGER);
            ''')
        wordHits = []
        for docId in self._doc_wordHits_cache.keys():
            for hit in self._doc_wordHits_cache[docId]:
                wordHits.append(
                    (int(docId), int(hit[0]), int(hit[1]), int(hit[2])))
        self.cur.executemany(
            ''' INSERT INTO docWordHits VALUES (?,?,?,?)
            ''', wordHits)
        self.db_conn.commit()

        # save docAnchorHits data to persistent storage
        # docAnchorHits: docId (INTEGER), wordId (INTEGER), anchorFontSize (INTEGER)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS docAnchorHits
                (docId INTEGER, wordId INTEGER, anchorFontSize INTEGER);
            ''')
        anchorHits = []
        for docId in self._doc_anchorHits_cache.keys():
            for hit in self._doc_anchorHits_cache[docId]:
                anchorHits.append((int(docId), int(hit[0]), int(hit[1])))
        self.cur.executemany(
            ''' INSERT INTO docAnchorHits VALUES (?,?,?)
            ''', anchorHits)
        self.db_conn.commit()

        # save docSnippet data to persistent storage
        # docSnippet: docId (INTEGER), snippet (TEXT)
        self.cur.execute(''' CREATE TABLE IF NOT EXISTS docSnippet
                (docId INTEGER, snippet TEXT);
            ''')
        docSnippets = [(docId, snippet)
                       for docId, snippet in self._doc_snippet_cache.items()]
        self.cur.executemany(
            ''' INSERT INTO docSnippet VALUES (?,?)
            ''', docSnippets)
        self.db_conn.commit()
Example No. 43
 def rank_page(self):
     self._page_ranks = pagerank.page_rank(self._link_cache.keys())
Example No. 44
 def compute_page_rank(self):
     """Call the page rank function with the _page_rank_list input to compute the score for each doc"""
     self._page_rank = page_rank(self._page_rank_list)
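Across these examples, page_rank is most commonly called with a list of (source_doc_id, target_doc_id) link pairs and returns a dict mapping doc_id to score, sometimes taking num_iterations and initial_pr keyword arguments. A minimal reference sketch of a function with that shape (an assumption for illustration, not the actual implementation used by these projects):

# Hypothetical iterative PageRank over (from_id, to_id) link pairs
from collections import defaultdict

def page_rank_sketch(links, num_iterations=20, initial_pr=1.0, damping=0.85):
    outgoing = defaultdict(set)   # from_id -> set of to_ids
    incoming = defaultdict(set)   # to_id   -> set of from_ids
    nodes = set()
    for src, dst in links:
        outgoing[src].add(dst)
        incoming[dst].add(src)
        nodes.update((src, dst))
    ranks = dict.fromkeys(nodes, initial_pr)
    for _ in range(num_iterations):
        new_ranks = {}
        for node in nodes:
            # each in-neighbour contributes its rank divided by its out-degree
            inbound = sum(ranks[src] / len(outgoing[src]) for src in incoming[node])
            new_ranks[node] = (1 - damping) + damping * inbound
        ranks = new_ranks
    return ranks

Usage would mirror the examples above, e.g. page_rank_sketch(bot._links) for a crawler that stores (from_doc_id, to_doc_id) tuples.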