import sys
import random
from time import sleep

# crawler_backend holds the project's database helpers used below; the page
# helpers (get_text, get_url, get_links, get_robots_list, retrieve_content,
# change_url, check_error_status) are defined elsewhere in this module.
import crawler_backend


def get_urls_from_text(url):
    # Collect every .onion address mentioned in the page text at `url`.
    links = set()
    text = get_text(url)
    words = text.split()
    for word in words:
        if '.onion' in word:
            index = word.index('.onion')
            # Keep the 16 characters preceding '.onion' (v2 onion address length).
            new_url = word[index - 16:index] + '.onion'
            links.add(new_url)
    for link in links:
        try:
            link = get_url(link)
            found = crawler_backend.check_url(link)
            if found:
                print('\n' + link + ' is already in the database.')
                print('Skipped...')
                continue
            else:
                print('\nInserting... ' + link)
                crawler_backend.insert_page(link)
                crawler_backend.insert_status(link, 'Alive')
                print('Status : Alive')
        except Exception:
            # The page could not be reached; record it as offline instead.
            print('\nInserting... ' + link)
            link = 'http://' + link
            crawler_backend.insert_page(link)
            crawler_backend.insert_status(link, 'Offline')
            print('Status : Offline')
def crawl(url):
    # Single-level crawl: record every link found on the page at `url`.
    url = get_url(url)
    links = get_links(url)
    for link in links:
        try:
            link = get_url(link)
            found = crawler_backend.check_url(link)
            if found:
                print('\n' + link + ' is already in the database.')
                print('Skipped...')
                continue
            else:
                print('\nInserting... ' + link)
                crawler_backend.insert_page(link)
                crawler_backend.insert_status(link, 'Alive')
                print('Status : Alive')
        except Exception:
            # Unreachable link: store it with an Offline status.
            print('\nInserting... ' + link)
            link = 'http://' + link
            crawler_backend.insert_page(link)
            crawler_backend.insert_status(link, 'Offline')
            print('Status : Offline')
def depth_crawl(url, depth):
    # Breadth-first crawl that follows links up to `depth` levels from the seed URL.
    url = get_url(url)
    robots_list = get_robots_list(url)
    if url in robots_list:
        print('Not allowed to crawl url: ' + url + '\n')
        sys.exit()
    found = crawler_backend.check_url(url)
    if found:
        print(url + ' has already been crawled!\n')
        sys.exit()
    count = 0
    while count < depth:
        count += 1
        print('\nGetting level ' + str(count) + ' links...\n')
        try:
            if count == 1:
                links_list = get_links(url)
                for link in links_list:
                    retrieve_content(link)
                    # Pause after each page
                    delay = random.randint(1, 5)
                    sleep(delay)
            else:
                # Iterate over a snapshot so links discovered at this level
                # are not expanded until the next level.
                for link in list(links_list):
                    if link == url:
                        continue  # skip the seed URL itself
                    temp_links = get_links(link)
                    for temp_link in temp_links:
                        if temp_link not in links_list:
                            links_list.append(temp_link)
                            retrieve_content(temp_link)
                            # Pause after each page
                            delay = random.randint(1, 5)
                            sleep(delay)
        except KeyboardInterrupt:
            print('')
            print('Program interrupted by user...')
            break
        except Exception:
            check_error_status(url)
            print()
            crawler_backend.insert_content(url, 'ERROR: Page skipped')
            continue
def random_crawl(url, number_of_pages):
    # Visit `number_of_pages` pages, hopping to a new random page after each one.
    url = get_url(url)
    count = 0
    robots_list = get_robots_list(url)
    if url in robots_list:
        print('Not allowed to crawl url: ' + url + '\n')
        url = find_new_url(url)
    found = crawler_backend.check_url(url)
    if found:
        print(url + ' has already been crawled!\n')
        print('Searching for a new page to crawl...\n')
        try:
            url = find_new_url(url)
        # find_new_url() exits via sys.exit() when it runs out of candidates.
        except (Exception, SystemExit):
            print('Unable to find any valid links\n')
            print('The application will now terminate!\n')
            sys.exit()
    while count < number_of_pages:
        try:
            retrieve_content(url)
        except KeyboardInterrupt:
            print('')
            print('Program interrupted by user...')
            break
        except Exception:
            check_error_status(url)
            print()
            crawler_backend.insert_content(url, 'ERROR: Page skipped')
            url = find_new_url(url)
            continue
        count += 1
        # Pause after each page
        delay = random.randint(1, 5)
        sleep(delay)
        try:
            url = find_new_url(url)
        except (Exception, SystemExit):
            break
def find_new_url(url):
    # Keep mutating the URL until one is found that is not already in the database.
    page_found_counter = 0
    while True:
        url = change_url(url)
        found = crawler_backend.check_url(url)
        if not found:
            url = get_url(url)
            return url
        else:
            print('Searching for a new page to crawl... (%s)\n' % url)
            page_found_counter += 1
            if page_found_counter == 30:
                # Give up after 30 consecutive hits on already-crawled pages.
                print('\nThere are no more pages to collect')
                sys.exit()
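
# A minimal sketch of how the crawl modes above might be driven from the
# command line. The flag names and mode strings here are assumptions for
# illustration only; they are not part of the original crawler.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Onion crawler (illustrative driver)')
    parser.add_argument('url', help='seed .onion URL to start from')
    parser.add_argument('--mode', choices=['crawl', 'depth', 'random', 'text'],
                        default='crawl', help='crawl strategy to run')
    parser.add_argument('--depth', type=int, default=2,
                        help='levels to follow in depth mode')
    parser.add_argument('--pages', type=int, default=10,
                        help='pages to visit in random mode')
    args = parser.parse_args()

    if args.mode == 'crawl':
        crawl(args.url)
    elif args.mode == 'depth':
        depth_crawl(args.url, args.depth)
    elif args.mode == 'random':
        random_crawl(args.url, args.pages)
    else:
        get_urls_from_text(args.url)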