def add_links_to_frontier(self):
    """Add every newly discovered link to the frontier, creating site rows for new domains."""
    print("self.links_to_crawl: ", len(self.links_to_crawl))
    for link in self.links_to_crawl:
        current_link_url = link.geturl()
        # print("SCHEME: --->", link.scheme)
        current_link_domain = link.scheme + "://" + link.netloc
        # print("current_link_domain: ", current_link_domain)

        # Only scrape sites in the gov.si domain
        if not self.check_if_current_domain_is_allowed(current_link_domain):
            continue

        # Only add pages in the allowed domain
        # self.lock.acquire()
        # all_sites = db.get_all_sites()
        # all_pages = db.get_all_pages()

        # check if the link exists in any of the pages in db
        # check if the domain of the link already exists in db
        same_domain = False
        # domain_id = self.return_domain_if_it_already_exists(all_sites, current_link_domain)
        domain_id_or_false = db.check_site_exists(current_link_domain)

        if not domain_id_or_false:
            # new domain
            robotstext_content, sitemap_content = Crawler.get_robots_and_sitemap_content(current_link_domain)
            new_site = db.insert_site(current_link_domain, robotstext_content, sitemap_content)
            if self.check_if_page_is_allowed_by_robots_txt(new_site, current_link_url):
                new_page = db.insert_page(new_site[0], PAGE_TYPE_CODES[2], current_link_url,
                                          "", "", "200", "040521")
                db.insert_link(self.page_currently_crawling[0], new_page[0])
                # print("inserting new page new domain")
        else:
            # existing domain
            if self.check_if_page_is_allowed_by_robots_txt(self.site_currently_crawling, current_link_url):
                # print("inserting", current_link_url)
                new_page = db.insert_page(domain_id_or_false, PAGE_TYPE_CODES[2], current_link_url,
                                          "", "", "200", "040521")
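# check_if_current_domain_is_allowed is called above but not shown in this section.
# A minimal sketch, assuming the crawler only accepts gov.si and its subdomains
# (per the "Only scrape sites in the gov.si domain" comment); in the project it is
# presumably a method on Crawler, shown here as a free function for illustration.
import urllib.parse

def check_if_current_domain_is_allowed(current_link_domain):
    # current_link_domain looks like "https://evem.gov.si"; keep only the host part.
    netloc = urllib.parse.urlparse(current_link_domain).netloc
    return netloc == "gov.si" or netloc.endswith(".gov.si")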
def insert_seed_urls_into_db():
    """Insert each seed URL (and its site, robots.txt and sitemap) into the database before crawling starts."""
    for seed_url in SEED_URLS:
        page_obj = urllib.parse.urlparse(seed_url)
        current_url = page_obj.geturl()
        current_site_url = page_obj.scheme + "://" + page_obj.netloc
        robotstext_content, sitemap_content = Crawler.get_robots_and_sitemap_content(current_site_url)
        current_site = db.insert_site(current_site_url, robotstext_content, sitemap_content)
        current_page = db.insert_page(current_site[0], PAGE_TYPE_CODES[2], current_url,
                                      "", "", "200", "040521")
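# Crawler.get_robots_and_sitemap_content is used above but not defined in this section.
# A minimal sketch, assuming it downloads <site>/robots.txt with urllib, follows an
# optional "Sitemap:" directive, and returns both bodies as strings (empty on failure);
# the project's actual helper may differ. In the project it presumably lives on the
# Crawler class; it is shown here as a free function so the sketch is self-contained.
import urllib.request

def get_robots_and_sitemap_content(site_url):
    def fetch(url):
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                return response.read().decode("utf-8", errors="replace")
        except Exception:
            return ""

    robots_content = fetch(site_url + "/robots.txt")
    sitemap_content = ""
    for line in robots_content.splitlines():
        if line.lower().startswith("sitemap:"):
            sitemap_content = fetch(line.split(":", 1)[1].strip())
            break
    return robots_content, sitemap_content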
import db_methods as db

PAGE_TYPE_CODES = ["HTML", "DUPLICATE", "FRONTIER", "BINARY"]
DATA_TYPES = ["DOC", "DOCX", "PDF", "PPT", "PPTX"]

'''Insert site'''
site = db.insert_site("test.com123", "robotstext", "sitemaptext")
print("inserted site:", site)

'''Insert page'''
page1 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test.com/index.html", "html_content", "300", "040521")
print("inserted page:", page1)
page2 = db.insert_page(site[0], PAGE_TYPE_CODES[0], "test1.com/index.html", "html_content2", "303", "040522")
print("inserted page:", page2)

'''Insert image'''
image = db.insert_image(page1[0], "slika.jpg", "image/jpg", "asd", "040521")
print("inserted image:", image)

'''Insert page_data'''
page_data = db.insert_page_data(page2[0], DATA_TYPES[0], "asd")
print("page_data_id:", page_data)

'''Insert link'''
link = db.insert_link(page1[0], page2[0])
print("inserted link:", link)

print("getting all sites:", db.get_all_sites())
print("getting all pages:", db.get_all_pages())
print("getting all images:", db.get_all_images())
print("getting all page data:", db.get_all_page_data())
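# The db_methods module itself is not shown in this section. A minimal sketch of the
# interface the test script above relies on, using an in-memory sqlite3 database purely
# for illustration (the project may use a different database and column layout). The
# key convention it demonstrates is that each insert_* helper returns the inserted row
# as a tuple whose first element is the generated id, matching site[0] / page1[0] above.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE site (id INTEGER PRIMARY KEY, domain TEXT, robots_content TEXT, sitemap_content TEXT)"
)

def insert_site(domain, robots_content, sitemap_content):
    cur = conn.execute(
        "INSERT INTO site (domain, robots_content, sitemap_content) VALUES (?, ?, ?)",
        (domain, robots_content, sitemap_content),
    )
    conn.commit()
    # Return the full inserted row so callers can read the id as row[0].
    return conn.execute("SELECT * FROM site WHERE id = ?", (cur.lastrowid,)).fetchone()

# Example usage, mirroring the first call in the test script above:
print(insert_site("test.com123", "robotstext", "sitemaptext"))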