Code Example #1
	def gather_links(page_url):
		html_string = ''
		returnlinks = set()
		

		try:	
			user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
			headers = {'Connection' : 'keep-alive', 'User-Agent': user_agent,}
			request=Request(page_url,None,headers) #The assembled request
			response = urlopen(request)
			returnheader = response.getheader('Content-Type')	
			html_bytes = response.read()
			




			if 'text/html' in returnheader:
			
				html_string = html_bytes.decode("utf-8")
				finder = LinkFinder(Spider.base_url, page_url)
				finder.feed(html_string)
				foundlinks = finder.page_links()
				#returnlinks = foundlinks
				returnlinks = Spider.cull(foundlinks, page_url, response) 
			
			response.close()
		
		except URLError:
			print('error encountered, most likely a 404\n')
			return set()			
		return returnlinks 
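Nearly every example on this page hands the downloaded HTML to a LinkFinder object and then asks it for the discovered links (the method name varies between page_links(), get_links() and links()). The class itself is never shown here; the sketch below is only an illustration of what such a class could look like when built on html.parser.HTMLParser, with the constructor signature (base_url, page_url) inferred from the calls above, not the actual implementation used by any of these projects.

    from html.parser import HTMLParser
    from urllib.parse import urljoin

    class LinkFinder(HTMLParser):
        def __init__(self, base_url, page_url):
            super().__init__()
            self.base_url = base_url
            self.page_url = page_url
            self.links = set()

        # feed() calls this for every opening tag; collect the href of each <a>
        # and resolve relative paths against the page currently being crawled
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for attribute, value in attrs:
                    if attribute == 'href' and value:
                        self.links.add(urljoin(self.page_url, value))

        def page_links(self):
            return self.links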
Code Example #2
File: spider.py Project: safetychinese/link_crawler
    def gather_links(page_url):
        html_string = ""
        try:
            response = urlopen(page_url)

            if "text/html" in response.getheader("content-Type"):
                zipped_html_bytes = response.read()
                if Spider.html_gzipped:
                    try:
                        html_bytes = gzip.decompress(zipped_html_bytes)
                    except IOError:
                        Spider.html_gzipped = False
                        html_bytes = zipped_html_bytes
                else:
                    html_bytes = zipped_html_bytes
                try:
                    html_string = html_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        html_string = html_bytes.decode("gbk")
                    except Exception as e:
                        print(e)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(e)
            print("Error: can not crawl page.")
            return set()
        response.close()
        return finder.page_links()
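Code Example #2 decides whether to gunzip the body with a class-level Spider.html_gzipped flag and falls back when gzip.decompress raises IOError. A different way to handle this (not what the project above does) is to advertise gzip support in the request and only decompress when the server reports it in Content-Encoding; the helper name below is made up for illustration.

    import gzip
    from urllib.request import Request, urlopen

    def fetch_html(page_url):
        # hypothetical helper: ask for gzip explicitly and decompress only when
        # the response header says the server actually used it
        request = Request(page_url, headers={'Accept-Encoding': 'gzip'})
        response = urlopen(request)
        body = response.read()
        if response.getheader('Content-Encoding') == 'gzip':
            body = gzip.decompress(body)
        response.close()
        return body.decode('utf-8', errors='replace')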
Code Example #3
 def crawl_page(thread_name, page_url):
     if (page_url not in (Spider.crawled | Spider.finish)):
         finder = LinkFinder(Spider.base_url, page_url)
         Spider.add_links_to_queue_finish(finder.links())
         Spider.queue.remove(page_url)
         Spider.crawled.add(page_url)
         Spider.update_files()
Code Example #4
File: spider.py Project: gayensouvik1/spidy
    def gather_links(page_url):
        html_string = ''
        try:
            request = urllib2.Request(page_url)
            response = urllib2.urlopen(request)
            # response = urllib2.urlopen(page_url)
            u = response.info().getheader('Content-Type')
            print u
            # print Spider.Type

            if u.find(Spider.Type) != -1:
                vv = page_url
                if vv not in Spider.downloaded:
                    download_file('./' + Spider.projectname, vv)
                    Spider.downloaded.add(vv)

            if u.startswith('text/html'):
                if Spider.Type == "image":
                    Spider.get_images(page_url)
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')

                # print "here"

            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception, err:
            # print('Error: cannot crawl page')
            print Exception, err
            return set()
Code Example #5
	def gather_links(page_url):
		html_string = ''
		try:
			# Some websites dislike being browsed by programs; sending a browser-like
			# User-Agent works around that by pretending to be a normal user
			user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
			values = {'name' : 'Rosa Foord',
			          'location' : 'Lyon',
			          'language' : 'Python' }
			headers = { 'User-Agent' : user_agent }

			data  = urllib.parse.urlencode(values)
			data = data.encode('utf-8')
			req = urllib.request.Request(page_url, data, headers)
			response = urllib.request.urlopen(req)
			if 'text/html' in response.getheader('Content-Type'):
				html_bytes = response.read()
				html_string = html_bytes.decode('utf-8')
			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
			#dataRetriever = DataRetriever(Spider.base_url, page_url)
			#dataRetriever.feed(html_string)
		except Exception as e:
			print(str(e))
			return set()
		return finder.page_links()
Code Example #6
File: spider.py Project: beaupedraza/Spider
 def gather_links(page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(CustomConnection.URL(page_url))
     except:
         return set()
     return finder.page_links()
Code Example #7
File: spider.py Project: cryptogun/link_crawler
    def gather_links(page_url):
        html_string = ""
        try:
            response = urlopen(page_url)

            if "text/html" in response.getheader("content-Type"):
                zipped_html_bytes = response.read()
                if Spider.html_gzipped:
                    try:
                        html_bytes = gzip.decompress(zipped_html_bytes)
                    except IOError:
                        Spider.html_gzipped = False
                        html_bytes = zipped_html_bytes
                else:
                    html_bytes = zipped_html_bytes
                try:
                    html_string = html_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    try:
                        html_string = html_bytes.decode("gbk")
                    except Exception as e:
                        print(e)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(e)
            print("Error: can not crawl page.")
            return set()
        response.close()
        return finder.page_links()
Code Example #8
 def gather_links(page_url):
     # Create a variable/object to store HTML request's response
     html_string = ''
     # Enclose in a try-except block to handle exceptions during connections
     try:
         # Get the response after trying to connect to a webpage
         response = urlopen(page_url)
         # Check if response contains, text/html as Content-Type in header
         if 'text/html' in response.getheader('Content-Type'):
             # Read the response byte wise
             html_bytes = response.read()
             # Decode the response from byte order to human readable format
             # And store in variable/object created earlier to store response
             html_string = html_bytes.decode('utf-8')
         # Create a LinkFinder() object to start parsing webpages
         finder = LinkFinder(Spider.base_url, page_url)
         # Start parsing webpages using HTMLParser class's feed function
         finder.feed(html_string)
     # Catch exception
     except Exception as e:
         # Print exception info to console
         print(str(e))
         # Since exception occured, return empty set() object
         return set()
     # If all operations are successful, return results
     return finder.page_links()
Code Example #9
File: spider.py Project: ShuvenduBikash/web_crawler
    def gather_links_and_text(page_url):
        if page_url in Spider.crawled:
            Spider.queue.remove(page_url)
            print("***************************** Duplicate found!!!!!!!!!!!!!!!!")
            return set()

        else:
            html_string = ''
            try:
                article = Article(page_url, language='bn')
                article.download()
                article.parse()

                html_string = article.html
                Spider.news += article.title + '\n' + article.text

                Spider.page_count += 1
                file = codecs.open(Spider.html_pages + randomString(8) + '.html', "a", "utf-8")
                file.write(html_string)
                file.close()

                if Spider.page_count % 100 == 0:
                    with codecs.open(Spider.project_name + '/all_texts.txt', "a", "utf-8") as w:
                        for l in Spider.news:
                            w.write(l)
                    Spider.news = ""

                # find the links
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)

            except Exception as e:
                print(str(e))
                return set()
            return finder.page_links()
Code Example #10
    def gather_links(page_url):
        """
        Connects to the site, reads the HTML response and decodes the bytes into
        a readable string, then passes it to LinkFinder, which parses the page and
        collects all of its links. If there is no error the set of links is returned;
        otherwise an empty set is returned along with the message
        "Error: cannot crawl page!"
        """
        html_string = ''
        #using error catching on networking
        try:
            response = urlopen(page_url)

            #make sure its a html page and not some pdf format
            if 'text/html' in response.getheader('Content-Type'):
                #python read in html bytes format
                html_bytes = response.read()
                #convert into human readable character (utf-8)
                html_string = html_bytes.decode('utf-8')
                #create a linkfinder object
            finder = LinkFinder(Spider.base_url, page_url)
            #feed in the html strings
            finder.feed(html_string)
        except:
            print('Error: cannot crawl page!')
            return set()
        return finder.page_links()
Code Example #11
	def boot():
		create_project_dir(Spider.project_name)
		create_data_files(Spider.project_name, Spider.base_url)
		Spider.queue = file_to_set(Spider.queue_file)
		Spider.crawled = file_to_set(Spider.crawled_file)

	# Updates user display, fills queue and updates files
	@staticmethod
	def crawl_page(thread_name, page_url):
		if page_url not in Spider.crawled:
			print(thread_name + ' now crawling ' + page_url)
			print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
			Spider.add_links_to_queue(Spider.gather_links(page_url))
			Spider.queue.remove(page_url)
			Spider.crawled.add(page_url)
			Spider.update_files()

	# Converts raw response data into readable information and checks for proper html formatting
	@staticmethod
	def gather_links(page_url):
		html_string = ''
		try:
			response = urlopen(page_url)
			if 'text/html' in response.getheader('Content-Type'):
				html_bytes = response.read()
				html_string = html_bytes.decode("utf-8")
			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
		except Exception as e:
			print(str(e))
			return set()
		return finder.page_links()
Code Example #12
 def gather_links(page_url):
     html_string = Spider.connect(page_url)
     if html_string is None:
         return set()                                        # if there is an error return an empty set
     finder = LinkFinder(Spider.base_url, page_url)
     finder.feed(html_string)                                # pass in html data
     return finder.page_links()                              # if there is no error return the set of page links
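Code Example #12 delegates the network work to Spider.connect(), which is not shown and presumably returns the decoded HTML or None on failure. A hypothetical stand-in, assuming the same urllib pattern used by the other examples on this page:

    from urllib.request import urlopen

    def connect(page_url):
        # hypothetical stand-in for Spider.connect: return the page's HTML as a
        # string, or None when the fetch fails or the body is not text/html
        try:
            response = urlopen(page_url)
            if 'text/html' not in (response.getheader('Content-Type') or ''):
                return None
            return response.read().decode('utf-8')
        except Exception:
            return None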
Code Example #13
File: spider.py Project: hbdhj/python
    def gather_links(page_url):
        html_string = ''
        try:
            print("urlopen("+page_url+Spider.suffix+")")
            response = urlopen(page_url+Spider.suffix)
            #if response.getheader('Content-Type') == 'text/html':
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
            print('page_url = '+page_url)
            urlElems = page_url.split('/')
            fileName = Spider.project_name +'/'+urlElems[-1]+'.html'
            print("save to "+fileName)
            with open(fileName, 'w') as f:
                f.write(html_string)
            #else:
            #    print('Failed to get Content-Type')
            finder = LinkFinder(Spider.base_url, page_url, Spider.ahref_class)
            finder.feed(html_string)

            converter = HTMLToTXTConverter()
            converter.feed(html_string)
            fileName = Spider.project_name +'/'+urlElems[-1]+'.txt'
            print("save to "+fileName)
            with open(fileName, 'w') as f:
                f.write(converter.getText())

        except:
            e = sys.exc_info()[0]
            print(e)
            print('Error: can not crawl page')
            return set()
        return finder.page_links()
Code Example #14
File: spider.py Project: tutu86/Spider
 def gather_links(page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(CustomConnection.URL(page_url))
     except:
         return set()
     return finder.page_links()
Code Example #15
    def gather_links(thread_name, page_url):
        data = {}
        try:
            user_agent_list = [
                # Chrome
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                # Internet Explorer
                'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
            ]
            headers = {'User-Agent': random.choice(user_agent_list)}

            response = requests.get(page_url, headers=headers)

            if 'text/html' in response.headers['Content-Type']:
                soup = BeautifulSoup(response.text, 'lxml')
                title = soup.find('title').text
                keyword = soup.find('meta', attrs={'name':
                                                   'keywords'})['content']
                description = soup.find('meta',
                                        attrs={'name':
                                               'description'})['content']
                data = {
                    'title': title,
                    'meta_keywords': keyword,
                    'meta_description': description,
                    'page_url': page_url
                }
                print("Data fetched from {} : \n".format(thread_name), data)
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(response.text)
                Spider.send_data_to_es(thread_name, data)
                del data
            else:
                return set()
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()
Code Example #16
File: spider.py Project: everfree19/ProjectLexicon
 def gather_links(page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.getAllExternalLinks(page_url)
     except:
         print('Error : can not crawl page')
         return set()
     return finder.page_internalLink()
Code Example #17
 def gather_links(html_string, page_url):
     try:
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #18
 def gather_links(page_url):
     try:
         response = urlopen(page_url)
         html_bytes = response.read()
         html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error')
         return set()
     return finder.page_links()
Code Example #19
 def __init__(self, crawl_queue: Queue, seen_urls: set, processed_urls: set, rank_queue: Queue
              , depth_limit, domain_name, crawl_queue_time_out: int, logger):
     self.logger = logger
     self.crawl_queue = crawl_queue
     self.seen_urls = seen_urls
     self.processed_urls = processed_urls
     self.rank_queue = rank_queue
     self.depth_limit = depth_limit
     self.domain_name = domain_name
     self.link_finder = LinkFinder()
     self.crawl_queue_time_out = crawl_queue_time_out
Code Example #20
 def gather_links(page_url):
     html_string = ''
     try:
         response = requests.get(page_url)
         if 'text/html' in response.headers['Content-Type']:
             html_string = response.text
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #21
File: spider.py Project: LeonDuan/Crawler
 def gather_links(page_url):
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error: cannot crawl page ' + page_url)
         return set()
     return finder.page_links()
Code Example #22
File: Spider.py Project: harry363/Crawler
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         return set()
     return finder.page_links()
Code Example #23
File: spider.py Project: suqingdong/Sources
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_string = response.read().decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print('Error: can not crawl page| ', e)
         return set()
     return finder.page_links()
Code Example #24
File: pagerunner.py Project: Veziik/hyperCrawler
	def gatherLinks(pageUrl):
		htmlText = ''
		returnlinks = set()
		try:	
			request=Request(pageUrl,None,Pagerunner.headers) #The assembled request
			response = urlopen(request)
			returnheader = response.getheader('Content-Type')	
			htmlBytes = response.read()
			Pagerunner.addResponse((pageUrl, response))

			if 'text/html' in returnheader:
			
				htmlText = htmlBytes.decode('utf-8')
				finder = LinkFinder(Pagerunner.startAddress, pageUrl)
				finder.feed(htmlText)
				foundlinks = finder.page_links() 
				returnlinks = foundlinks
				#print(returnlinks)
			

			response.close()

			Pagerunner.visited.add(pageUrl)
		
		except URLError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except UnicodeDecodeError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except UnicodeEncodeError as e:
			print(str(e) + ' : ' +  pageUrl)			
			returnlinks =  set()

		except ConnectionResetError as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		except IncompleteRead as e:
			print(str(e) + ' : ' +  pageUrl)
			returnlinks =  set()

		finally:
			Pagerunner.visited.add(pageUrl)

		return returnlinks 
Code Example #25
 def gather_links(url):
     html = ''
     try:
         response = urlopen(url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.url, url)
         finder.feed(html)
     except:
         print("Error: Can't crawl page")
         return set()
     return finder.page_links()
Code Example #26
File: spider.py Project: zhengjinzhj/home
 def gather_links(page_url):
     # html_string = ''
     try:
         response = requests.get(page_url, proxies=Spider.proxy)
         # if 'text/html' in response.getheader('Content-Type'):
         html_bytes = response.text
         html_string = html_bytes.encode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print str(e)
         return set()
     return finder.page_links()
Code Example #27
 def gather_links(page_url):
     html_string = ''  # convert bits to string
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') in ('application/pdf', 'text/html'):
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)  # parses HTML and returns links
     except:
         print('Error: cannot crawl page')
         return set()  # no links on the page, return empty set
     return finder.page_links()
Code Example #28
 def gather_links(page_url: str) -> Set[str]:
     html_string = ""
     try:
         response = urlopen(page_url)
         if "text/html" in response.getheader("Content-Type"):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #29
File: spider.py Project: Agham/Spidey
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print("Error : Can't crawl page")
         return set()
     return finder.page_links()
Code Example #30
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url) #goes to page url 
         if 'text/html' in response.getheader('Content-Type'): #checks the content type
             html_bytes = response.read() #reads the html
             html_string = html_bytes.decode("utf-8") # decodes html
         finder = LinkFinder(spider.homepage_url, page_url) 
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #31
File: spider.py Project: keegaz/Python
 def gather_links(page__url):
     html_string = ""
     try:
         response = urlopen(page__url)
         if response.getheader("Content-Type") == "text/html":
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page__url)
         finder.feed(html_string)
     except:
         print("Error: cannot crawl page")
         return set()
     return finder.page_links()
Code Example #32
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.info().gettype() == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except IOError, e:
         print('Error: cannot crawl page')
         print(e)
         return list()
Code Example #33
File: spider.py Project: parkchul72/Crawler
 def gather_links(page_url):
     html_string = ''
     try:
          response = urlopen(page_url)
          if response.getheader('Content-type') == 'text/html; charset=utf-8':
              html_bytes = response.read()
              html_string = html_bytes.decode('utf-8')
          finder = LinkFinder(Spider.base_url, page_url)
          finder.feed(html_string)
     except:
         print('Error: can not crawl page')
         return set()
     return finder.page_links()
Code Example #34
File: spider.py Project: alikoptan/spidey
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('ERROR: CAN NOT CRAWL, WEBSITE COULD BE UNREACHABLE')
         return set()
     return finder.page_links()
Code Example #35
 def gather_links(page_url):
     html_string = ""
     try:
         response = urlopen(page_url)
         if response.getheader("Content-Type") == "text/html":
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print("Error: unable to crawl page")
         return set()
     return finder.page_links()
Code Example #36
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #37
 def gather_links(page_url):
     html_string = ""
     try:
         response = urlopen(page_url)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url,page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set()
     return finder.page_links()
Code Example #38
File: spider.py Project: zangree/Spider
 def gather_links(page_url):
     html_string = ''
     #try:
     response = urlopen(page_url)
     #if 'text/html' in response.getheader('Content-Type'):
     html_bytes = response.read()
     html_string = html_bytes
     finder = LinkFinder(Spider.base_url, page_url)
     links = finder.parseAndGetLinks(html_string)
     '''except Exception as e:
         print(str(e))
         return set()'''
     return links
Code Example #39
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)  #Fetch the page
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode('utf-8')
             finder = LinkFinder(Spider.base_url, page_url)
             finder.feed(html_string)
             return finder.page_links()
     except:
         print('Error: Can not crawl page')
         return set()
Code Example #40
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error: can not crawl page')
         return set()  #IT NEEDS SOMETHING RETURNED, SO WE JUST RETURN EMPTY
     return finder.page_links()
Code Example #41
File: spider.py Project: andreisid/python
 def gather_links(page_url):
     html_string = ''
     try:
         response = requests.get(page_url)
         if 'text/html' in response.headers['Content-Type']:
             html_string = str(response.content)
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(e)
         print('Error: can not crawl page')
         return set()
     return finder.page_links()
Code Example #42
File: spider.py Project: macctown/Crawler
 def gather_links(page_url):
     html_str = ''
     try:
         request = Request(page_url, headers=Spider.headers)
         response = urlopen(request)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_str = html_bytes.decode('utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_str)
     except:
         print('Cannot access ' + page_url)
         return set()
     return finder.page_links()
Code Example #43
	def gather_link(page_url):
		html_string = ''
		try:
			response = urlopen(page_url)
			if response.getheader('Content-Type') == 'text/html':
				html_bytes = response.read()
				html_string = html_bytes.decode("utf-8")
			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
		except:
			print("error")
			return set()

		return finder.page_links()
Code Example #44
File: spider.py Project: lixiongjiu/Spider2
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)

            if 'text/html' in response.info().getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url)
            finder.feed(html_string)
            # return the set of crawled URLs
            return finder.get_links()
        except:
            print('Error: can not crawl page.')
            return set()
Code Example #45
File: spider.py Project: lq08025107/pyspider
 def gather_links(page_url):
     html_string = ''
     try:
         header = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'
         }
         response = requests.get(page_url, headers=header)
         header=response.headers['Content-Type']
         if header=='text/html; charset=utf-8':
             html_string=response.text
         finder=LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except:
         print('Error: can not crawl page')
         return set()
     return finder.page_links()
Code Example #46
File: spider.py Project: v4iv/Spider
 def gather_links(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         if response.getheader('Content-Type') == 'text/html':
             html_bytes = response.read()
             html_string = html_bytes.decode(encoding='utf-8')
         elif response.getheader('content-type') == 'text/html;charset=utf-8':
             html_bytes = response.read()
             html_string = html_bytes.decode(encoding='utf-8')
         finder = LinkFinder(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print('\nException : ' + str(e) + '\n')
         return set()
     return finder.page_links()
Code Example #47
File: PySpider.py Project: yjhao/PySpider
 def gather_link(page_url):
     html_string = ''
     try:
         response = urlopen(page_url)
         # convert bytes from the python parsing data to human readable data
         if response.info()['Content-type']=='text/html' or \
                         response.info()['content-type'] == 'text/html; charset=utf-8' or \
                         response.info()['Content-type'] == 'text/html; charset=utf-8' or \
                         response.info()['Content-type'] == 'text/html; charset=UTF-8':
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = LinkFinder(PySpider.base_url, page_url)
         finder.feed(html_string)
     except:
         print 'Error: can not crawl page'
         return set()
     return finder.page_links()
Code Example #48
File: test.py Project: ecowan/mlb-realtime
class LinkFinderTest(unittest.TestCase):

    def setUp(self):
        self.search_url = "http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_22/"
        self.link_finder = LinkFinder(self.search_url)

    def testUrlIsFound(self):
        self.assertIsNotNone(self.link_finder.get_link())
Code Example #49
File: spider.py Project: agiridh/Web-Crawler
    def gather_links(page_url):
        html_string = ''

        try:
            #response = urlopen(page_url)  # make sure we are connecting to an actual website
            #if response.getheader('Content-Type') == 'text/html':
            #    html_bytes = response.read()
            #    html_string = html_bytes.decode('utf-8')
            html_string = requests.get(page_url).text
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)

        except:
            print('Error: Cannot crawl page -  {}'.format(page_url))
            return set()

        return finder.get_page_links()
Code Example #50
File: spider.py Project: iFun/WebCrawler
    def gather_link(page_url):
        html_string = ''
        finder = ''
        # goto website, get the byte data convert to string
        # pass it through to linkfinder, and find all the links
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.info()['Content-Type']:
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
                finder = LinkFinder(Spider.base_url, page_url)
                finder.feed(html_string)
           
        except Exception as e:
            print(str(e))
            return set()

        return finder.page_links()
Code Example #51
File: spider.py Project: Souloist/Projects
	def gather_link(page_url):

		html_string = ''

		# urlopen returns byte data which we have to turn into a readable string 
		try:
			response = urlopen(page_url)
			
			# make sure it is html data (in case we crawl a pdf file)
			if response.getheader('Content-Type') == 'text/html':
				html_bytes = response.read()
				html_string = html_bytes.decode("utf-8")

			finder = LinkFinder(Spider.base_url, page_url)
			finder.feed(html_string)
		except:
			print('Error: Cannot crawl page')
			# Return empty set if we cannot crawl the link
			return set()

		return finder.page_links()
Code Example #52
class Friends_finder():
    def __init__(self, user_name, password):
        self.user_name = user_name
        self.password = password
        self.facebook_url = "https://www.facebook.com/"
        self.more_clicks = 0
        self.existent_people_links = set()

        self.setup()
        self.log_in()
        while 1:
            self.scroll_down_mannualy()        
            self.gather_links()
            self.append_links_to_queue()
    
    def setup(self):
        print('Setting up WebDriver')
        self.driver = webdriver.Firefox()
        self.driver.get(self.facebook_url)
    
    def log_in(self):
        ready = False
        while ready == False:
            ready = True
            try:
                self.driver.find_element_by_id("email").send_keys(self.user_name)
            except:
                ready = False

        self.driver.find_element_by_id("pass").send_keys(self.password)
        self.driver.find_element_by_id("pass").send_keys(Keys.RETURN)

        sleep(2)

        try:
            self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
            print('Connected')
        except:
            print('Unable to connect, please do it manually')
            ready = False
            while ready == False:
                try:
                    self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
                    ready = True
                except:
                    pass
    
    def scroll_down_mannualy(self):
        print("please scroll down the page")
        print("When done, press any key to start gathering links")
        input()


    
    def gather_links(self):
        print('gathering links, please wait ...')
        self.link_finder = LinkFinder()
        self.link_finder.feed(self.driver.page_source)
        self.gathered_links = self.link_finder.get_links()
        print(str(len(self.gathered_links)) + ' links were gathered')

    def append_links_to_queue(self):
        print('Appending links and updating the queue file...')
        self.get_existent_links()
        self.update_queue()
 
    def get_existent_links(self):
        with open("data/people_to_add.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))

        with open("data/errors.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))

        with open("data/added_friends.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))


    def update_queue(self):
        self.new_links_added = 0
        with open("data/people_to_add.txt", "a") as f:
            for item in self.gathered_links:
                if item not in self.existent_people_links:
                    self.new_links_added += 1
                    f.write(item + '\n')
        print( str(self.new_links_added) + ' Items were added to the queue file')
Code Example #53
 def gather_links(self):
     print('gathering links, please wait ...')
     self.link_finder = LinkFinder()
     self.link_finder.feed(self.driver.page_source)
     self.gathered_links = self.link_finder.get_links()
     print(str(len(self.gathered_links)) + ' links were gathered')
Code Example #54
File: test.py Project: ecowan/mlb-realtime
 def setUp(self):
     self.search_url = "http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_22/"
     self.link_finder = LinkFinder(self.search_url)