def gather_links(page_url):
    html_string = ''
    if 'com/photos' in page_url:
        try:
            datalist = []
            urllist = []
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')
                content = html_string.split()
                for i in content:
                    if i.startswith('src="'):
                        datalist.append(i)
                print("\n\n")
                print(datalist)
                print("\n\n")
                for num in datalist:
                    if num not in urllist:
                        urllist.append(num)
                print("\n\n")
                print(urllist)
                print("\n\n")
                for i in urllist:
                    if 'images.unsplash.com' in i:
                        download(i)
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
    else:
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
    return finder.page_links()
def gather_links(page_url):
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type').split('/')[0] in Spider.bannedResponses:
            raise Exception("Invalid Response")
        if response.getheader('Content-Type').split('/')[1] in Spider.bannedResponses:
            raise Exception("Invalid Response")
        if response.getheader('Content-Type').split(';')[0] == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            soup = BeautifulSoup(html_string, 'html.parser')
            title = soup.find('title')
            title = title.text
            finder = LinkFinder(Spider.base_url, page_url)
            finder.handle_starttag(html_string)
            aTextItem = textItem(parse.urljoin(Spider.base_url, page_url),
                                 finder.return_url_text(html_string))
            Spider.textDict.update({Spider.dictCount: aTextItem})
            Spider.titleDict.update({page_url: title})
            Spider.dictCount += 1
            return finder.page_links()
    except Exception as ex:
        print(ex)
        print('Error: cannot crawl page ' + parse.urljoin(Spider.base_url, page_url))
        if page_url not in Spider.crawled:
            Spider.countOfNotCrawledPgages += 1
        return set()
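The variant above wraps each crawled page's text in a textItem before storing it in Spider.textDict. That container is not part of the snippet; a minimal sketch of what it might look like, with field names that are only assumptions, is:

# Hypothetical sketch of the textItem container referenced above; the real
# class is not shown in this snippet, so these field names are assumptions.
from dataclasses import dataclass

@dataclass
class textItem:
    url: str   # absolute URL of the crawled page (the parse.urljoin result)
    text: str  # page text returned by finder.return_url_text()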
def gather_links(page_url):
    # Create a variable/object to store HTML request's response
    html_string = ''
    # Enclose in a try-except block to handle exceptions during connections
    try:
        # Get the response after trying to connect to a webpage
        response = urlopen(page_url)
        # Check if the response contains text/html as Content-Type in its header
        if 'text/html' in response.getheader('Content-Type'):
            # Read the response byte-wise
            html_bytes = response.read()
            # Decode the response from bytes to a human-readable format
            # and store it in the variable/object created earlier
            html_string = html_bytes.decode('utf-8')
            # Create a LinkFinder() object to start parsing webpages
            finder = LinkFinder(Spider.base_url, page_url)
            # Start parsing webpages using HTMLParser class's feed function
            finder.feed(html_string)
    # Catch exception
    except Exception as e:
        # Print exception info to console
        print(str(e))
        # Since an exception occurred, return an empty set() object
        return set()
    # If all operations are successful, return results
    return finder.page_links()
def gather_links(page_url):
    html_string = ""
    try:
        response = urlopen(page_url)
        if "text/html" in response.getheader("Content-Type"):
            zipped_html_bytes = response.read()
            if Spider.html_gzipped:
                try:
                    html_bytes = gzip.decompress(zipped_html_bytes)
                except IOError:
                    Spider.html_gzipped = False
                    html_bytes = zipped_html_bytes
            else:
                html_bytes = zipped_html_bytes
            try:
                html_string = html_bytes.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    html_string = html_bytes.decode("gbk")
                except Exception as e:
                    print(e)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except Exception as e:
        print(e)
        print("Error: cannot crawl page.")
        return set()
    response.close()
    return finder.page_links()
def gather_links(page_url):
    """
    Connects to the site, takes the HTML, converts the HTML bytes into a
    proper readable string, and passes it to LinkFinder, which parses through
    it and gets all the links of the URL. If there's no error, the links are
    returned; otherwise an empty set is returned along with the message
    "Error: cannot crawl page!"
    """
    html_string = ''
    # using error catching on networking
    try:
        response = urlopen(page_url)
        # make sure it's an html page and not some pdf format
        if 'text/html' in response.getheader('Content-Type'):
            # python reads in html bytes format
            html_bytes = response.read()
            # convert into human-readable characters (utf-8)
            html_string = html_bytes.decode('utf-8')
            # create a LinkFinder object
            finder = LinkFinder(Spider.base_url, page_url)
            # feed in the html string
            finder.feed(html_string)
    except:
        print('Error: cannot crawl page!')
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = Spider.connect(page_url)
    if html_string is None:
        return set()  # if there is an error, return an empty set
    finder = LinkFinder(Spider.base_url, page_url)
    finder.feed(html_string)  # pass in html data
    return finder.page_links()  # if there is no error, return the set of page links
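This variant delegates fetching to Spider.connect(), which is not shown here. A minimal sketch, assuming it follows the same urlopen pattern as the other variants and signals failure with None:

# Hypothetical sketch of the Spider.connect() helper used above; the real
# implementation is not shown, so this simply mirrors the urlopen pattern of
# the other variants. It would live on the Spider class as a @staticmethod.
from urllib.request import urlopen

def connect(page_url):
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            return response.read().decode('utf-8')
    except Exception as e:
        print(str(e))
    return None  # non-HTML content or any error is reported as None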
def gather_links(page_url):
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(CustomConnection.URL(page_url))
    except:
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = requests.get(page_url)
        html_string = response.content
        soup = BeautifulSoup(html_string, 'html.parser')
        finder = LinkFinder(soup)
    # The original snippet breaks off after the line above; the error handling
    # and return below follow the pattern of the other variants (an assumption).
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links_and_text(page_url):
    if page_url in Spider.crawled:
        Spider.queue.remove(page_url)
        print("***************************** Duplicate found!!!!!!!!!!!!!!!!")
        return set()
    else:
        html_string = ''
        try:
            article = Article(page_url, language='bn')
            article.download()
            article.parse()
            html_string = article.html
            Spider.news += article.title + '\n' + article.text
            Spider.page_count += 1
            file = codecs.open(Spider.html_pages + randomString(8) + '.html', "a", "utf-8")
            file.write(html_string)
            file.close()
            if Spider.page_count % 100 == 0:
                with codecs.open(Spider.project_name + '/all_texts.txt', "a", "utf-8") as w:
                    for l in Spider.news:
                        w.write(l)
                Spider.news = ""
            # find the links
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()
def crawl_page(thread_name, page_url):
    if page_url not in (Spider.crawled | Spider.finish):
        finder = LinkFinder(Spider.base_url, page_url)
        Spider.add_links_to_queue_finish(finder.links())
        Spider.queue.remove(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()
def gather_links(page_url):
    html_string = ''
    try:
        request = urllib2.Request(page_url)
        response = urllib2.urlopen(request)
        # response = urllib2.urlopen(page_url)
        u = response.info().getheader('Content-Type')
        print u
        # print Spider.Type
        if u.find(Spider.Type) != -1:
            vv = page_url
            if vv not in Spider.downloaded:
                download_file('./' + Spider.projectname, vv)
                Spider.downloaded.add(vv)
        if u.startswith('text/html'):
            if Spider.Type == "image":
                Spider.get_images(page_url)
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            # print "here"
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except Exception, err:
        # print('Error: cannot crawl page')
        print Exception, err
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    returnlinks = set()
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        headers = {'Connection': 'keep-alive', 'User-Agent': user_agent}
        request = Request(page_url, None, headers)  # The assembled request
        response = urlopen(request)
        returnheader = response.getheader('Content-Type')
        html_bytes = response.read()
        if 'text/html' in returnheader:
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
            foundlinks = finder.page_links()
            # returnlinks = foundlinks
            returnlinks = Spider.cull(foundlinks, page_url, response)
        response.close()
    except URLError:
        print('error encountered, most likely a 404\n')
        return set()
    return returnlinks
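Spider.cull() is called above but not defined in this snippet; a hypothetical sketch of that kind of post-filter (the specific rules below are assumptions, not the original behaviour):

# Hypothetical sketch of a Spider.cull()-style post-filter; the real method is
# not shown above, so the rules here (same-domain only, skip obvious binaries)
# are assumptions.
from urllib.parse import urlparse

def cull(found_links, page_url, response):
    kept = set()
    page_domain = urlparse(page_url).netloc
    for link in found_links:
        parsed = urlparse(link)
        if parsed.netloc and parsed.netloc != page_domain:
            continue  # drop off-domain links
        if parsed.path.lower().endswith(('.jpg', '.png', '.pdf', '.zip')):
            continue  # drop obvious non-HTML resources
        kept.add(link)
    return kept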
def boot():
    create_project_dir(Spider.project_name)
    create_data_files(Spider.project_name, Spider.base_url)
    Spider.queue = file_to_set(Spider.queue_file)
    Spider.crawled = file_to_set(Spider.crawled_file)

# Updates user display, fills queue and updates files
@staticmethod
def crawl_page(thread_name, page_url):
    if page_url not in Spider.crawled:
        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.add_links_to_queue(Spider.gather_links(page_url))
        Spider.queue.remove(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()

# Converts raw response data into readable information and checks for proper html formatting
@staticmethod
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    # The original snippet is cut off here; the except clause and return below
    # follow the pattern used by the other variants (an assumption).
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def ElementFinder(url):
    logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
    logging.info('Running element finder')
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    links_to_be_checked = LinkFinder(url)
    test_pages = []
    # for link in links_to_be_checked[3:4]:
    for link in links_to_be_checked:
        logging.info('Checking URL ' + link)
        driver.get(link)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        page_elems_list = []
        for id, a in enumerate(soup.find_all()):
            elems = a.attrs
            elems['element_id'] = id
            elems['element_name'] = a.name
            elems['element_text'] = a.text
            elems['page_link'] = link
            elems['element_points_title'] = 0
            elems['element_points_content'] = 0
            elems['element_index'] = 0
            page_elems_list.append(elems)
        test_pages.append(page_elems_list)
    return test_pages
def gather_links(page_url):
    html_string = ''
    try:
        # Some websites dislike being browsed by programs; this works around
        # that problem by pretending this is a normal user
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        values = {'name': 'Rosa Foord',
                  'location': 'Lyon',
                  'language': 'Python'}
        headers = {'User-Agent': user_agent}
        data = urllib.parse.urlencode(values)
        data = data.encode('utf-8')
        req = urllib.request.Request(page_url, data, headers)
        response = urllib.request.urlopen(req)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
            # dataRetriever = DataRetriever(Spider.base_url, page_url)
            # dataRetriever.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(thread_name, page_url):
    data = {}
    try:
        user_agent_list = [
            # Chrome
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            # Internet Explorer
            'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
        ]
        headers = {'User-Agent': random.choice(user_agent_list)}
        response = requests.get(page_url, headers=headers)
        if 'text/html' in response.headers['Content-Type']:
            soup = BeautifulSoup(response.text, 'lxml')
            title = soup.find('title').text
            keyword = soup.find('meta', attrs={'name': 'keywords'})['content']
            description = soup.find('meta', attrs={'name': 'description'})['content']
            data = {
                'title': title,
                'meta_keywords': keyword,
                'meta_description': description,
                'page_url': page_url
            }
            print("Data fetched from {} : \n".format(thread_name), data)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(response.text)
            Spider.send_data_to_es(thread_name, data)
            del data
        else:
            return set()
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
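The Spider.send_data_to_es() hook called above is not included in the snippet; a minimal sketch, assuming the official elasticsearch Python client and a hypothetical 'pages' index:

# Hypothetical sketch of the Spider.send_data_to_es() hook used above, assuming
# the official `elasticsearch` client and an index name of 'pages'; the real
# method is not part of this snippet.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')

def send_data_to_es(thread_name, data):
    try:
        # document= is the 8.x keyword; older 7.x clients use body= instead
        es.index(index='pages', document=data)
        print('{} indexed {}'.format(thread_name, data['page_url']))
    except Exception as e:
        print(str(e))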
def gather_links(html_string, page_url):
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
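This variant expects the caller to have fetched and decoded the page already; a minimal usage sketch follows (the fetch step is an assumption that mirrors the other snippets, not part of the original code):

# Usage sketch for the two-argument variant above, assuming gather_links is a
# @staticmethod on Spider as in the other variants. The fetch step here is an
# assumption mirroring those snippets.
from urllib.request import urlopen

page_url = 'https://example.com/'
response = urlopen(page_url)
if 'text/html' in response.getheader('Content-Type'):
    html_string = response.read().decode('utf-8')
    links = Spider.gather_links(html_string, page_url)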
def gather_links(page_url):
    try:
        response = urlopen(page_url)
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        print('Error')
        return set()
    return finder.page_links()
def __init__(self, crawl_queue: Queue, seen_urls: set, processed_urls: set,
             rank_queue: Queue, depth_limit, domain_name,
             crawl_queue_time_out: int, logger):
    self.logger = logger
    self.crawl_queue = crawl_queue
    self.seen_urls = seen_urls
    self.processed_urls = processed_urls
    self.rank_queue = rank_queue
    self.depth_limit = depth_limit
    self.domain_name = domain_name
    self.link_finder = LinkFinder()
    self.crawl_queue_time_out = crawl_queue_time_out
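A hypothetical construction of this crawler worker, showing how the queues and limits line up with the constructor parameters; the enclosing class name (Crawler) and the concrete values are assumptions for illustration:

# Hypothetical usage sketch for the constructor above; the class name (Crawler)
# and the concrete values are assumptions for illustration only.
import logging
from queue import Queue

worker = Crawler(
    crawl_queue=Queue(),
    seen_urls=set(),
    processed_urls=set(),
    rank_queue=Queue(),
    depth_limit=3,
    domain_name='example.com',
    crawl_queue_time_out=30,
    logger=logging.getLogger('crawler'),
)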
def gather_links(page_url):
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except:
        print('Error: cannot crawl page ' + page_url)
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except:
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = requests.get(page_url)
        if 'text/html' in response.headers['Content-Type']:
            html_string = response.text
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(thread_name, page_url):
    try:
        response = urlopen(page_url)
        html_bytes = response.read()
        html_decompressed = zlib.decompress(html_bytes, 16 + zlib.MAX_WBITS)
        html_string = html_decompressed.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except:
        try:
            response = urlopen(page_url)
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            # print(html_string)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except:
            print(thread_name + ' cannot crawl ' + page_url)
            return set()
    return finder.page_links()
def gatherLinks(pageUrl):
    htmlText = ''
    returnlinks = set()
    try:
        request = Request(pageUrl, None, Pagerunner.headers)  # The assembled request
        response = urlopen(request)
        returnheader = response.getheader('Content-Type')
        htmlBytes = response.read()
        Pagerunner.addResponse((pageUrl, response))
        if 'text/html' in returnheader:
            htmlText = htmlBytes.decode('utf-8')
            finder = LinkFinder(Pagerunner.startAddress, pageUrl)
            finder.feed(htmlText)
            foundlinks = finder.page_links()
            returnlinks = foundlinks
            # print(returnlinks)
        response.close()
        Pagerunner.visited.add(pageUrl)
    except URLError as e:
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    except UnicodeDecodeError as e:
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    except UnicodeEncodeError as e:
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    except ConnectionResetError as e:
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    except IncompleteRead as e:
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    finally:
        Pagerunner.visited.add(pageUrl)
    return returnlinks
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except:
        print('Error: can not crawl page')
        return set()  # IT NEEDS SOMETHING RETURNED, SO WE JUST RETURN EMPTY
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except:
        print('ERROR: CAN NOT CRAWL, WEBSITE COULD BE UNREACHABLE')
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if response.info().gettype() == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except IOError, e:
        print('Error: cannot crawl page')
        print(e)
        return list()
    return finder.page_links()
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)  # Fetch the page
        if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
            return finder.page_links()
    except:
        print('Error: Cannot crawl page')
        return set()
def gather_links(page_url):
    html_string = ""
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(page_url):
    html_string = ""
    try:
        response = urlopen(page_url)
        if response.getheader("Content-Type") == "text/html":
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
    except:
        print("Error: unable to crawl page")
        return set()
    return finder.page_links()