Example no. 1
0
def clear_database():
    """Delete every row from all tables, then read each table back so the
    operator can verify the database is empty.

    Children are deleted before parents (links and page data before pages,
    pages before sites) — presumably to satisfy foreign-key constraints;
    confirm against the schema.
    """
    print("deleting all links:", db.delete_all_links())
    print("deleting all page data:", db.delete_all_page_data())
    # BUG FIX: label previously said "getting all images:" although the
    # call deletes them.
    print("deleting all images:", db.delete_all_images())
    print("deleting all pages:", db.delete_all_pages())
    print("deleting all sites:", db.delete_all_sites())

    # Read-back pass: every call should now print an empty result set.
    print("getting all sites:", db.get_all_sites())
    print("getting all pages:", db.get_all_pages())
    print("getting all images:", db.get_all_images())
    print("getting all page data:", db.get_all_page_data())
    # Label normalized (was "getting_all_links:") to match the others.
    print("getting all links:", db.get_all_links())
Example no. 2
0
    def get_page_to_crawl(self):
        """Return the next crawlable (page, site) pair, or (None, None).

        Scans all pages for the first one whose tag column (index 2) is
        "FRONTIER". If none exists, returns (None, None). Otherwise the
        page's site row is fetched and the per-domain rate limit checked:
        when the domain is not yet accessible the thread sleeps for the
        required delay and rescans; when it is accessible, the page's tag
        is rewritten to PAGE_TYPE_CODES[0] (claiming it — presumably the
        "HTML"/visited code, per the original comment; confirm) and the
        updated page row plus its site row are returned.

        NOTE(review): the FRONTIER scan runs outside self.lock, so two
        threads may select the same page before either claims it — verify
        whether db-level serialization makes this safe.
        """
        while True:
            all_pages = db.get_all_pages()

            # First page still tagged as frontier, if any.
            page_to_crawl = next(
                (page for page in all_pages if page[2] == "FRONTIER"), None)
            if page_to_crawl is None:
                # No work available anywhere.
                return None, None

            # Site row for the selected page (page[1] is the site id).
            page_to_crawl_site = db.get_site_by_id(page_to_crawl[1])

            # Seconds until this domain may be accessed again (0 == now).
            how_long_to_wait = hf.how_long_to_wait(page_to_crawl_site[1],
                                                   self.time_accessed,
                                                   self.time_between_calls)

            if how_long_to_wait == 0:
                # Claim the page by rewriting its tag. Using `with`
                # guarantees the lock is released even if the db call
                # raises — the original acquire()/release() pair would
                # have leaked the lock on an exception.
                with self.lock:
                    page_to_crawl = db.update_page_by_id(
                        page_to_crawl[0], page_to_crawl[1],
                        PAGE_TYPE_CODES[0], page_to_crawl[3],
                        page_to_crawl[4], page_to_crawl[5],
                        page_to_crawl[6], page_to_crawl[7])

                return page_to_crawl, page_to_crawl_site

            # Domain still rate-limited: back off, then rescan.
            time.sleep(how_long_to_wait)
Example no. 3
0
# Smoke test: insert two pages, an image, a page-data row and a link,
# read everything back, then delete everything in child-before-parent order.

# Insert pages
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test.com/index.html", "html_content", "300", "040521")
print("inserted page:", page1)

page2 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test1.com/index.html", "html_content2", "303", "040522")
print("inserted page:", page2)

# Insert image (attached to page1)
image = db.insert_image(page1[0], "slika.jpg", "image/jpg", "asd", "040521")
print("inserted image:", image)

# Insert page_data (attached to page2)
page_data = db.insert_page_data(page2[0], DATA_TYPES[0], "asd")
print("page_data_id:", page_data)

# Insert link (page1 -> page2)
link = db.insert_link(page1[0], page2[0])
print("inserted link:", link)

# Read-back pass over every table.
print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())
print("getting all images:", db.get_all_images())
print("getting all page data:", db.get_all_page_data())
# Label normalized (was "getting_all_links:") to match the others.
print("getting all links:", db.get_all_links())


# Cleanup, children first.
print("deleting all links:", db.delete_all_links())
print("deleting all page data:", db.delete_all_page_data())
# BUG FIX: label previously said "getting all images:" although the
# call deletes them.
print("deleting all images:", db.delete_all_images())
print("deleting all pages:", db.delete_all_pages())
print("deleting all sites:", db.delete_all_sites())