Code example #1
    def insert_page_as_binary(self, data_type):
        # Mark the current page as binary content and record its data type
        #self.lock.acquire()
        db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            PAGE_TYPE_CODES[3], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.page_currently_crawling[7])

        # Store the data type for the page; the raw data itself is passed as None
        db.insert_page_data(self.page_currently_crawling[0], data_type, None)
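
The data_type passed to insert_page_as_binary above is expected to be one of the DATA_TYPES codes listed in code example #8. How the caller derives that code is not shown in these examples; below is a minimal sketch, assuming the type is inferred from the response's Content-Type header. CONTENT_TYPE_MAP and guess_data_type are illustrative names, not part of the original project.

# Hypothetical helper: map a Content-Type header to one of the DATA_TYPES codes.
# The mapping below is an assumption; the original crawler may detect binaries differently.
CONTENT_TYPE_MAP = {
    "application/msword": "DOC",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
    "application/pdf": "PDF",
    "application/vnd.ms-powerpoint": "PPT",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
}

def guess_data_type(content_type_header):
    """Return a DATA_TYPES code for a Content-Type header, or None for non-binary content."""
    mime = content_type_header.split(";")[0].strip().lower()
    return CONTENT_TYPE_MAP.get(mime)
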
Code example #2
    def handle_duplicate_page(self):
        # acquire lock
        #self.lock.acquire()

        # Hash of the current page's HTML content
        h = hash_tool.create_content_hash(self.current_page_html)

        # Check whether the page is an exact copy of a document already stored in the database
        returned_duplicate = db.find_page_duplicate(h)
        if returned_duplicate and returned_duplicate[
                3] != self.page_currently_crawling[3]:
            # Update page as 'DUPLICATE'
            updated_page = db.update_page_by_id(
                self.page_currently_crawling[0],
                self.page_currently_crawling[1], PAGE_TYPE_CODES[1],
                self.page_currently_crawling[3],
                self.page_currently_crawling[4],
                self.page_currently_crawling[5],
                self.page_currently_crawling[6],
                self.page_currently_crawling[7])
            self.page_currently_crawling = updated_page
            print("Page ", self.page_currently_crawling[3],
                  "is a DUPLICATE from", returned_duplicate[3])

            # Save a new link: to_page is set to duplicate version
            db.insert_link(self.page_currently_crawling[0],
                           returned_duplicate[0])

            #self.lock.release()
            return True
        return False
        """ else:
Code example #3
    def insert_accessed_time(self):
        # Store the time the page was accessed on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.accessed_time)
        self.page_currently_crawling = updated_page
Code example #4
    def insert_status_code(self):
        # Store the HTTP status code on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], self.page_currently_crawling[5],
            self.status_code, self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
Code example #5
    def insert_html_content(self):
        # Store the downloaded HTML content on the current page record
        #self.lock.acquire()
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.current_page_html, self.page_currently_crawling[5],
            self.page_currently_crawling[6], self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
Code example #6
    def insert_page_hash(self):
        # acquire lock
        #self.lock.acquire()
        # Calculate the content hash of the downloaded HTML
        # (named content_hash to avoid shadowing the built-in hash())
        content_hash = hash_tool.create_content_hash(self.current_page_html)

        # Update the page's hash in the database
        updated_page = db.update_page_by_id(
            self.page_currently_crawling[0], self.page_currently_crawling[1],
            self.page_currently_crawling[2], self.page_currently_crawling[3],
            self.page_currently_crawling[4], content_hash,
            self.page_currently_crawling[6], self.page_currently_crawling[7])
        self.page_currently_crawling = updated_page
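
hash_tool.create_content_hash itself is not shown in these examples. Below is a minimal sketch of what such a helper could look like, assuming a SHA-256 digest over the UTF-8 encoded HTML; the implementation details are an assumption, not the project's actual hash_tool.

# Hypothetical sketch of the content-hash helper (the real hash_tool may differ).
import hashlib

def create_content_hash(html_content):
    """Return a hex digest of the page content; assumes SHA-256 over UTF-8 bytes."""
    if html_content is None:
        return None
    return hashlib.sha256(html_content.encode("utf-8")).hexdigest()
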
Code example #7
    def get_page_to_crawl(self):
        while True:
            # Fetch all pages from the database
            all_pages = db.get_all_pages()

            # find first page that has the tag frontier
            page_to_crawl = None
            for page in all_pages:

                if page[2] == "FRONTIER":
                    page_to_crawl = page
                    break
            if page_to_crawl is None:
                #print("---------------------->", threading.get_ident(), "There are no pages available to crawl!")
                return None, None

            # get site url for the first page that has the tag frontier
            page_to_crawl_site = db.get_site_by_id(page_to_crawl[1])

            # check if the domain can be accessed at current time
            how_long_to_wait = hf.how_long_to_wait(page_to_crawl_site[1],
                                                   self.time_accessed,
                                                   self.time_between_calls)

            if how_long_to_wait == 0:
                # if yes, return page and domain, and mark the page as visited (just change the tag to HTML)

                # Hold the lock while the page's type is switched to HTML
                with self.lock:
                    updated_page = db.update_page_by_id(
                        page_to_crawl[0], page_to_crawl[1], PAGE_TYPE_CODES[0],
                        page_to_crawl[3], page_to_crawl[4], page_to_crawl[5],
                        page_to_crawl[6], page_to_crawl[7])

                page_to_crawl = updated_page

                return page_to_crawl, page_to_crawl_site

            else:
                # Wait until the domain may be accessed again, then retry
                time.sleep(how_long_to_wait)
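
hf.how_long_to_wait is likewise not shown. A minimal sketch follows, assuming self.time_accessed is a dict mapping each domain to the timestamp of its last request and self.time_between_calls is the minimum politeness delay in seconds; the signature and data structures are assumptions, not the project's actual helper.

# Hypothetical sketch of the politeness-delay helper (assumed signature and semantics).
import time

def how_long_to_wait(domain, time_accessed, time_between_calls):
    """Return how many seconds to wait before the domain may be requested again (0 if it may be hit now)."""
    last_access = time_accessed.get(domain)
    if last_access is None:
        return 0
    elapsed = time.time() - last_access
    return max(0, time_between_calls - elapsed)
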
Code example #8
import db_methods as db

PAGE_TYPE_CODES = ["HTML", "DUPLICATE", "FRONTIER", "BINARY"]
DATA_TYPES = ["DOC", "DOCX", "PDF", "PPT", "PPTX"]
# Insert a site
site = db.insert_site("test.com123", "robotstext", "sitemaptext")
print("inserted site:", site)

# Insert pages
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test.com/index.html",
                       "html_content", "300", "040521")
print("inserted page:", page1)

page2 = db.insert_page(site[0], PAGE_TYPE_CODES[2], "test1.com/index.html",
                       "html_content2", "303", "040522")
print("inserted page:", page2)

# Mark both pages as crawled (page type HTML)
updated_page1 = db.update_page_by_id(page1[0], page1[1], PAGE_TYPE_CODES[0],
                                     page1[3], page1[4], page1[5], page1[6])

updated_page2 = db.update_page_by_id(page2[0], page2[1], PAGE_TYPE_CODES[0],
                                     page2[3], page2[4], page2[5], page2[6])

print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())

print("deleting all pages:", db.delete_all_pages())
print("deleting all sites:", db.delete_all_sites())