Code example #1
    def insert_page_as_binary(self, data_type):
        # Mark the current page as binary content and record its data type
        #self.lock.acquire()
        db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            PAGE_TYPE_CODES[3], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.page_currently_crawling[7])

        # Store the data type for the page; the raw data itself is passed as None
        db.insert_page_data(self.page_currently_crawling[0], data_type, None)
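
The data_type passed to insert_page_as_binary above is expected to be one of the DATA_TYPES codes listed in code example #8. How the caller derives that code is not shown in these examples; below is a minimal sketch, assuming the type is inferred from the response's Content-Type header. CONTENT_TYPE_MAP and guess_data_type are illustrative names, not part of the original project.

# Hypothetical helper: map a Content-Type header to one of the DATA_TYPES codes.
# The mapping below is an assumption; the original crawler may detect binaries differently.
CONTENT_TYPE_MAP = {
    "application/msword": "DOC",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
    "application/pdf": "PDF",
    "application/vnd.ms-powerpoint": "PPT",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
}

def guess_data_type(content_type_header):
    """Return a DATA_TYPES code for a Content-Type header, or None for non-binary content."""
    mime = content_type_header.split(";")[0].strip().lower()
    return CONTENT_TYPE_MAP.get(mime)
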
Code example #2
    def handle_duplicate_page(self):
        # acquire lock
        #self.lock.acquire()

        # Hash of the current page's HTML content
        h = hash_tool.create_content_hash(self.current_page_html)

        # Check whether the page is an exact copy of a document already stored in the database
        returned_duplicate = db.find_page_duplicate(h)
        if returned_duplicate and returned_duplicate[
                3] != self.page_currently_crawling[3]:
            # Update page as 'DUPLICATE'
            updated_page = db.update_page_by_id(
                self.page_currently_crawling[0],
                self.page_currently_crawling[1], PAGE_TYPE_CODES[1],
                self.page_currently_crawling[3],
                self.page_currently_crawling[4],
                self.page_currently_crawling[5],
                self.page_currently_crawling[6],
                self.page_currently_crawling[7])
            self.page_currently_crawling = updated_page
            print("Page ", self.page_currently_crawling[3],
                  "is a DUPLICATE from", returned_duplicate[3])

            # Save a new link: to_page is set to duplicate version
            db.insert_link(self.page_currently_crawling[0],
                           returned_duplicate[0])

            #self.lock.release()
            return True
        return False
        """ else:
Code example #3
    def insert_accessed_time(self):
        # Store the time the page was accessed on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.accessed_time)
        self.page_currently_crawling = updated_page
Code example #4
    def insert_status_code(self):
        # Store the HTTP status code on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.status_code, self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
Code example #5
    def insert_html_content(self):
        # Store the downloaded HTML content on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.current_page_html, self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
Code example #6
    def insert_page_hash(self):
        # acquire lock
        #self.lock.acquire()
        # Calculate the content hash of the downloaded HTML
        # (named content_hash to avoid shadowing the built-in hash())
        content_hash = hash_tool.create_content_hash(self.current_page_html)

        # Update the page's hash in the database
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], content_hash,
            self.page_currently_crawling[6], self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
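
hash_tool.create_content_hash itself is not shown in these examples. Below is a minimal sketch of what such a helper could look like, assuming a SHA-256 digest over the UTF-8 encoded HTML; the implementation details are an assumption, not the project's actual hash_tool.

# Hypothetical sketch of the content-hash helper (the real hash_tool may differ).
import hashlib

def create_content_hash(html_content):
    """Return a hex digest of the page content; assumes SHA-256 over UTF-8 bytes."""
    if html_content is None:
        return None
    return hashlib.sha256(html_content.encode("utf-8")).hexdigest()
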
Code example #7
    def get_page_to_crawl(self):
        while True:
            # Fetch all pages from the database
            all_pages = db.get_all_pages()

            # find first page that has the tag frontier
            page_to_crawl = None
            for page in all_pages:

                if page[2] == "FRONTIER":
                    page_to_crawl = page
                    break
            if page_to_crawl is None:
                #print("---------------------->", threading.get_ident(), "There are no pages available to crawl!")
                return None, None

            # get site url for the first page that has the tag frontier
            page_to_crawl_site = db.get_site_by_id(page_to_crawl[1])

            # check if the domain can be accessed at current time
            how_long_to_wait = hf.how_long_to_wait(page_to_crawl_site[1],
                                                   self.time_accessed,
                                                   self.time_between_calls)

            if how_long_to_wait == 0:
                # if yes, return page and domain, and mark the page as visited (just change the tag to HTML)

                # Hold the lock while the page's type is switched to HTML
                with self.lock:
                    updated_page = db.update_page_by_id(
                        page_to_crawl[0], page_to_crawl[1], PAGE_TYPE_CODES[0],
                        page_to_crawl[3], page_to_crawl[4], page_to_crawl[5],
                        page_to_crawl[6], page_to_crawl[7])

                page_to_crawl = updated_page

                return page_to_crawl, page_to_crawl_site

            else:
                # Wait until the domain may be accessed again, then retry
                time.sleep(how_long_to_wait)
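
hf.how_long_to_wait is likewise not shown. A minimal sketch follows, assuming self.time_accessed is a dict mapping each domain to the timestamp of its last request and self.time_between_calls is the minimum politeness delay in seconds; the signature and data structures are assumptions, not the project's actual helper.

# Hypothetical sketch of the politeness-delay helper (assumed signature and semantics).
import time

def how_long_to_wait(domain, time_accessed, time_between_calls):
    """Return how many seconds to wait before the domain may be requested again (0 if it may be hit now)."""
    last_access = time_accessed.get(domain)
    if last_access is None:
        return 0
    elapsed = time.time() - last_access
    return max(0, time_between_calls - elapsed)
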
Code example #8
import db_methods as db

PAGE_TYPE_CODES = ["HTML", "DUPLICATE", "FRONTIER", "BINARY"]
DATA_TYPES = ["DOC", "DOCX", "PDF", "PPT", "PPTX"]
# Insert a site
site = db.insert_site("test.com123", "robotstext", "sitemaptext")
print("inserted site:", site)

# Insert pages
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test.com/index.html",
                       "html_content", "300", "040521")
print("inserted page:", page1)

page2 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test1.com/index.html",
                       "html_content2", "303", "040522")
print("inserted page:", page2)

# Mark both pages as crawled (page type HTML)
updated_page1 = db.update_page_by_id(page1[0], page1[1], PAGE_TYPE_CODES[0],
                                     page1[3], page1[4], page1[5], page1[6])

updated_page2 = db.update_page_by_id(page2[0], page2[1], PAGE_TYPE_CODES[0],
                                     page2[3], page2[4], page2[5], page2[6])

print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())

print("deleting all pages:", db.delete_all_pages())
print("deleting all sites:", db.delete_all_sites())