def gather_links(page_url):
    """Fetch page_url with a browser-like User-Agent and return the set
    of culled links found on it.

    Returns an empty set when the request fails with a URLError
    (typically a 404).
    """
    html_string = ''
    returnlinks = set()
    response = None
    try:
        # Some servers reject the default urllib agent, so present as Chrome.
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36')
        headers = {'Connection': 'keep-alive', 'User-Agent': user_agent}
        request = Request(page_url, None, headers)  # The assembled request
        response = urlopen(request)
        returnheader = response.getheader('Content-Type')
        html_bytes = response.read()
        # Only decode HTML payloads; other types are fed as empty string.
        if 'text/html' in returnheader:
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
        foundlinks = finder.page_links()
        returnlinks = Spider.cull(foundlinks, page_url, response)
    except URLError:
        print('error encountered, most likely a 404\n')
        return set()
    finally:
        # BUG FIX: the response was previously closed only on the success
        # path, so a decode/parse error leaked the open connection.
        if response is not None:
            response.close()
    return returnlinks
def gather_links(page_url):
    """Download page_url, transparently un-gzipping the body if the site
    serves gzip, decode it as utf-8 (falling back to gbk), and return the
    set of links found.

    Returns an empty set on any failure.
    """
    html_string = ""
    response = None
    try:
        response = urlopen(page_url)
        # getheader() matches header names case-insensitively.
        if "text/html" in response.getheader("content-Type"):
            zipped_html_bytes = response.read()
            if Spider.html_gzipped:
                try:
                    html_bytes = gzip.decompress(zipped_html_bytes)
                except IOError:
                    # Payload was not actually gzipped; remember that for
                    # subsequent pages.
                    Spider.html_gzipped = False
                    html_bytes = zipped_html_bytes
            else:
                html_bytes = zipped_html_bytes
            try:
                html_string = html_bytes.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    # Fall back for sites using the gbk encoding.
                    html_string = html_bytes.decode("gbk")
                except Exception as e:
                    print(e)
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(e)
        print("Error: can not craw page.")
        return set()
    finally:
        # BUG FIX: the response used to leak whenever an error occurred
        # after urlopen() succeeded (close() was only on the success path).
        if response is not None:
            response.close()
    return finder.page_links()
def crawl_page(thread_name, page_url):
    """Process one queued URL: harvest its links, then move it from the
    queue into the crawled set and persist the data files."""
    already_handled = Spider.crawled | Spider.finish
    if page_url in already_handled:
        return
    link_finder = LinkFinder(Spider.base_url, page_url)
    Spider.add_links_to_queue_finish(link_finder.links())
    Spider.queue.remove(page_url)
    Spider.crawled.add(page_url)
    Spider.update_files()
def gather_links(page_url): html_string = '' try: request = urllib2.Request(page_url) response = urllib2.urlopen(request) # response = urllib2.urlopen(page_url) u = response.info().getheader('Content-Type') print u # print Spider.Type if u.find(Spider.Type) != -1: vv = page_url if vv not in Spider.downloaded: download_file('./' + Spider.projectname, vv) Spider.downloaded.add(vv) if u.startswith('text/html'): if Spider.Type == "image": Spider.get_images(page_url) html_bytes = response.read() html_string = html_bytes.decode('utf-8') # print "here" finder = LinkFinder(Spider.base_url, page_url) finder.feed(html_string) except Exception, err: # print('Error: cannot crawl page') print Exception, err return set()
def gather_links(page_url):
    """POST a small identification form to page_url with a browser-like
    User-Agent and return every link found in the returned HTML.

    Returns an empty set on any error.
    """
    html_string = ''
    try:
        # Some websites dislike being browsed by programs; a browser
        # User-Agent header lets the crawler pass as a normal user.
        browser_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        form_fields = {'name': 'Rosa Foord',
                       'location': 'Lyon',
                       'language': 'Python'}
        encoded_body = urllib.parse.urlencode(form_fields).encode('utf-8')
        request = urllib.request.Request(
            page_url, encoded_body, {'User-Agent': browser_agent})
        response = urllib.request.urlopen(request)
        if 'text/html' in response.getheader('Content-Type'):
            html_string = response.read().decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url through CustomConnection and return its links.

    Returns an empty set if the fetch or the parse fails.
    """
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(CustomConnection.URL(page_url))
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also swallowed
        # SystemExit and KeyboardInterrupt, making the crawler
        # impossible to interrupt cleanly.
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Connect to page_url, parse the returned HTML and collect its links.

    Returns the set of links found, or an empty set when the request or
    the parse raises.
    """
    markup = ''
    try:
        response = urlopen(page_url)
        # Only HTML payloads are worth parsing for links.
        if 'text/html' in response.getheader('Content-Type'):
            markup = response.read().decode('utf-8')
        link_finder = LinkFinder(Spider.base_url, page_url)
        link_finder.feed(markup)
    except Exception as err:
        print(str(err))
        return set()
    return link_finder.page_links()
def gather_links_and_text(page_url):
    """Download a (Bengali) news article at page_url, archive its HTML,
    accumulate its text, and return the links found on the page.

    Returns an empty set for duplicates or on any error.
    """
    if page_url in Spider.crawled:
        Spider.queue.remove(page_url)
        print("***************************** Duplicate found!!!!!!!!!!!!!!!!")
        return set()
    else:
        html_string = ''
        try:
            article = Article(page_url, language='bn')
            article.download()
            article.parse()
            html_string = article.html
            Spider.news += article.title + '\n' + article.text
            Spider.page_count += 1
            # BUG FIX: the handle was bound to the name `file` (shadowing
            # the builtin) and leaked if write() raised; use `with`.
            with codecs.open(Spider.html_pages + randomString(8) + '.html',
                             "a", "utf-8") as page_file:
                page_file.write(html_string)
            # Flush the accumulated text to disk every 100 pages.
            if Spider.page_count % 100 == 0:
                with codecs.open(Spider.project_name + '/all_texts.txt',
                                 "a", "utf-8") as w:
                    # One write instead of the old character-by-character loop.
                    w.write(Spider.news)
                Spider.news = ""
            # find the links
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()
def gather_links(page_url):
    """Connect to page_url, decode the returned HTML bytes, and pass the
    text through LinkFinder to collect every link on the page.

    Returns the set of links, or an empty set (after printing a message)
    when the page cannot be crawled.
    """
    html_string = ''
    # error catching around all the networking
    try:
        response = urlopen(page_url)
        # make sure it is an html page and not some pdf/other format
        if 'text/html' in response.getheader('Content-Type'):
            # python reads the body in bytes format
            html_bytes = response.read()
            # convert into human readable characters (utf-8)
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # BUG FIX: narrowed from a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit.
        print('Error: cannot crawl page!')
        return set()
    return finder.page_links()
def boot():
    """Create the project directory and data files, then load the queue
    and crawled sets from disk."""
    create_project_dir(Spider.project_name)
    create_data_files(Spider.project_name, Spider.base_url)
    Spider.queue = file_to_set(Spider.queue_file)
    Spider.crawled = file_to_set(Spider.crawled_file)

# Updates user display, fills queue and updates files
@staticmethod
def crawl_page(thread_name, page_url):
    # BUG FIX: the original read `if page_url notin Spider.crawled`,
    # which is a SyntaxError; the operator is `not in`.
    if page_url not in Spider.crawled:
        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) +
              ' | Crawled ' + str(len(Spider.crawled)))
        Spider.add_links_to_queue(Spider.gather_links(page_url))
        Spider.queue.remove(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()

# Converts raw response data into readable information and checks for
# proper html formatting
@staticmethod
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        # BUG FIX: the original try block had no except clause (a
        # SyntaxError) and the function never returned the links.
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Return the set of links on page_url, or an empty set when the
    shared connection helper reports a failure."""
    page_markup = Spider.connect(page_url)
    if page_markup is None:
        # The connection failed upstream; nothing to parse.
        return set()
    link_finder = LinkFinder(Spider.base_url, page_url)
    link_finder.feed(page_markup)
    # No error: hand back the set of page links.
    return link_finder.page_links()
def gather_links(page_url):
    """Fetch page_url + Spider.suffix, save the raw HTML and a plain-text
    conversion under the project directory, and return the links found.

    Returns an empty set on any failure.
    """
    html_string = ''
    try:
        print("urlopen(" + page_url + Spider.suffix + ")")
        response = urlopen(page_url + Spider.suffix)
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        print('page_url = ' + page_url)
        urlElems = page_url.split('/')
        fileName = Spider.project_name + '/' + urlElems[-1] + '.html'
        print("save to " + fileName)
        # BUG FIX: write with utf-8 explicitly -- the page was decoded as
        # utf-8 and the platform default encoding may not round-trip it.
        with open(fileName, 'w', encoding='utf-8') as f:
            f.write(html_string)
        finder = LinkFinder(Spider.base_url, page_url, Spider.ahref_class)
        finder.feed(html_string)
        converter = HTMLToTXTConverter()
        converter.feed(html_string)
        fileName = Spider.project_name + '/' + urlElems[-1] + '.txt'
        print("save to " + fileName)
        with open(fileName, 'w', encoding='utf-8') as f:
            f.write(converter.getText())
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so Ctrl-C still works.
        e = sys.exc_info()[0]
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
def gather_links(thread_name, page_url):
    """Fetch page_url with a random browser User-Agent, push its title
    and meta tags to Elasticsearch, and return the links on the page.

    Returns an empty set for non-HTML responses or on any error.
    """
    data = {}
    try:
        user_agent_list = [
            # Chrome
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            # Firefox
            'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
        ]
        headers = {'User-Agent': random.choice(user_agent_list)}
        response = requests.get(page_url, headers=headers)
        if 'text/html' in response.headers['Content-Type']:
            soup = BeautifulSoup(response.text, 'lxml')
            # BUG FIX: pages missing a <title> or the keywords/description
            # meta tags used to raise (subscripting None), which dropped
            # all of the page's links; fall back to empty strings instead.
            title_tag = soup.find('title')
            title = title_tag.text if title_tag is not None else ''
            keyword_tag = soup.find('meta', attrs={'name': 'keywords'})
            keyword = keyword_tag['content'] if keyword_tag is not None else ''
            description_tag = soup.find('meta', attrs={'name': 'description'})
            description = description_tag['content'] if description_tag is not None else ''
            data = {
                'title': title,
                'meta_keywords': keyword,
                'meta_description': description,
                'page_url': page_url
            }
            print("Data fetched from {} : \n".format(thread_name), data)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(response.text)
            Spider.send_data_to_es(thread_name, data)
            del data
        else:
            return set()
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Collect all external links reachable from page_url and return the
    finder's internal-link set (empty set on failure)."""
    try:
        finder = LinkFinder(Spider.base_url, page_url)
        finder.getAllExternalLinks(page_url)
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also caught
        # KeyboardInterrupt/SystemExit.
        print('Error : can not crawl page')
        return set()
    return finder.page_internalLink()
def gather_links(html_string, page_url):
    """Parse an already-downloaded html_string belonging to page_url and
    return the set of links found (empty set when parsing fails)."""
    try:
        link_finder = LinkFinder(Spider.base_url, page_url)
        link_finder.feed(html_string)
    except Exception as err:
        print(str(err))
        return set()
    return link_finder.page_links()
def gather_links(page_url):
    """Fetch page_url, decode it as utf-8, and return the links found.

    Returns an empty set when anything fails.
    """
    try:
        response = urlopen(page_url)
        html_bytes = response.read()
        html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also caught
        # KeyboardInterrupt/SystemExit.
        print('Error')
        return set()
    return finder.page_links()
def __init__(self, crawl_queue: Queue, seen_urls: set, processed_urls: set,
             rank_queue: Queue, depth_limit, domain_name,
             crawl_queue_time_out: int, logger):
    """Store the crawl collaborators and configuration on the instance."""
    # Logging / parsing collaborators.
    self.logger = logger
    self.link_finder = LinkFinder()
    # Work queues and URL bookkeeping.
    self.crawl_queue = crawl_queue
    self.rank_queue = rank_queue
    self.seen_urls = seen_urls
    self.processed_urls = processed_urls
    # Crawl limits and scope.
    self.depth_limit = depth_limit
    self.domain_name = domain_name
    self.crawl_queue_time_out = crawl_queue_time_out
def gather_links(page_url):
    """GET page_url with requests and return every link found in its
    HTML body (empty set on any error)."""
    page_markup = ''
    try:
        response = requests.get(page_url)
        if 'text/html' in response.headers['Content-Type']:
            page_markup = response.text
        link_finder = LinkFinder(Spider.base_url, page_url)
        link_finder.feed(page_markup)
    except Exception as err:
        print(str(err))
        return set()
    return link_finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return the links found on it.

    Returns an empty set when the page cannot be crawled.
    """
    # BUG FIX: html_string was only assigned inside the HTML branch, so a
    # non-HTML response raised NameError at feed(); initialise it first.
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # BUG FIX: narrowed from a bare `except:`.
        print('Error: cannot crawl page ' + page_url)
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return the set of links found on it
    (silently returns an empty set on error)."""
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: the header is usually "text/html; charset=...", so the
        # old exact `== 'text/html'` comparison skipped almost every
        # real page; use a substring test instead.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`; still deliberately quiet.
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return its links (empty set on error)."""
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_string = response.read().decode('utf-8')
        # BUG FIX: the finder was constructed with Spider.page_url, which
        # does not exist on Spider -- the local argument was intended.
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('Error: can not crawl page| ', e)
        return set()
    return finder.page_links()
def gatherLinks(pageUrl):
    """Fetch pageUrl with the shared Pagerunner headers and return the
    set of links on the page.

    Returns an empty set on any of the known network/codec errors; the
    URL is marked visited on every path via the finally block.
    """
    htmlText = ''
    returnlinks = set()
    try:
        request = Request(pageUrl, None, Pagerunner.headers)  # The assembled request
        response = urlopen(request)
        returnheader = response.getheader('Content-Type')
        htmlBytes = response.read()
        Pagerunner.addResponse((pageUrl, response))
        # Only decode HTML payloads; everything else is fed as empty text.
        if 'text/html' in returnheader:
            htmlText = htmlBytes.decode('utf-8')
        finder = LinkFinder(Pagerunner.startAddress, pageUrl)
        finder.feed(htmlText)
        returnlinks = finder.page_links()
        response.close()
        Pagerunner.visited.add(pageUrl)
    except (URLError, UnicodeDecodeError, UnicodeEncodeError,
            ConnectionResetError, IncompleteRead) as e:
        # BUG FIX: five copy-pasted except clauses -- including an exact
        # duplicate ConnectionResetError handler -- collapsed into one
        # tuple handler with identical behaviour.
        print(str(e) + ' : ' + pageUrl)
        returnlinks = set()
    finally:
        Pagerunner.visited.add(pageUrl)
    return returnlinks
def gather_links(url):
    """Fetch url and return the set of links found (empty set on error)."""
    html = ''
    try:
        response = urlopen(url)
        # BUG FIX: compare with `in` -- servers usually send
        # "text/html; charset=...", so strict equality never matched.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.url, url)
        finder.feed(html)
    except Exception:
        # Narrowed from a bare `except:`.
        print("Error: Can't crawl page")
        return set()
    return finder.page_links()
def gather_links(page_url): # html_string = '' try: response = requests.get(page_url, proxies=Spider.proxy) # if 'text/html' in response.getheader('Content-Type'): html_bytes = response.text html_string = html_bytes.encode('utf-8') finder = LinkFinder(Spider.base_url, page_url) finder.feed(html_string) except Exception as e: print str(e) return set() return finder.page_links()
def gather_links(page_url):
    """Fetch page_url, decode its body and return the links found in it
    (empty set on failure)."""
    html_string = ''  # decoded page body
    try:
        response = urlopen(page_url)
        # BUG FIX: `x == 'application/pdf' or 'text/html'` was always
        # truthy (the second operand is a non-empty string), so every
        # content type -- images included -- was decoded as utf-8.
        content_type = response.getheader('Content-Type') or ''
        if 'text/html' in content_type or 'application/pdf' in content_type:
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)  # parses HTML and collects links
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: cannot crawl page')
        return set()  # no links on the page, return empty set
    return finder.page_links()
def gather_links(page_url: str) -> Set[str]:
    """Download page_url and return the set of links it contains.

    Any failure yields an empty set after printing the error.
    """
    markup = ""
    try:
        response = urlopen(page_url)
        if "text/html" in response.getheader("Content-Type"):
            markup = response.read().decode("utf-8")
        link_finder = LinkFinder(Spider.base_url, page_url)
        link_finder.feed(markup)
    except Exception as err:
        print(str(err))
        return set()
    return link_finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return its links; an empty set if it cannot
    be crawled."""
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also caught
        # KeyboardInterrupt/SystemExit.
        print("Error : Can't crawl page")
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Visit page_url, decode any HTML it returns and hand it to a
    LinkFinder; returns the links found or an empty set on error."""
    markup = ''
    try:
        response = urlopen(page_url)  # open the page url
        if 'text/html' in response.getheader('Content-Type'):  # content type check
            markup = response.read().decode("utf-8")  # read + decode the body
        finder = LinkFinder(spider.homepage_url, page_url)
        finder.feed(markup)
    except Exception as err:
        print(str(err))
        return set()
    return finder.page_links()
def gather_links(page__url):
    """Fetch page__url and return the links found on it (empty set on
    error)."""
    html_string = ""
    try:
        response = urlopen(page__url)
        # BUG FIX: substring match -- exact equality with "text/html"
        # rejected every header carrying a charset suffix.
        if "text/html" in response.getheader("Content-Type"):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page__url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print("Error: cannot crawl page")
        return set()
    return finder.page_links()
def gather_links(page_url): html_string = '' try: response = urlopen(page_url) if response.info().gettype() == 'text/html': html_bytes = response.read() html_string = html_bytes.decode("utf-8") finder = LinkFinder(Spider.base_url, page_url) finder.feed(html_string) except IOError, e: print('Error: cannot crawl page') print(e) return list()
def gather_links(page_url):
    """Fetch page_url and return its links (empty set on error)."""
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: only the exact string 'text/html; charset=utf-8'
        # matched before; a bare 'text/html' or any other charset was
        # silently skipped.  getheader() is case-insensitive on the name.
        if 'text/html' in (response.getheader('Content-type') or ''):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return the links it contains (empty set on
    error)."""
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: substring match -- exact equality missed headers that
        # carry a charset suffix.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            # BUG FIX: the decoded text was assigned to a typo variable
            # `tml_string`, so the finder was always fed an empty string
            # and no links were ever found.
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print('ERROR: CAN NOT CRAWL, WEBSITE COULD BE UNREACHABLE')
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return the links found on it (empty set on
    error)."""
    html_string = ""
    try:
        response = urlopen(page_url)
        # BUG FIX: substring match -- the strict equality test rejected
        # headers like "text/html; charset=utf-8".
        if "text/html" in response.getheader("Content-Type"):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print("Error: unable to crawl page")
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url, decode its HTML and return the set of links found.

    Any exception is reported and yields an empty set.
    """
    page_text = ''
    try:
        response = urlopen(page_url)
        content_type = response.getheader('Content-Type')
        if 'text/html' in content_type:
            page_text = response.read().decode("utf-8")
        link_finder = LinkFinder(Spider.base_url, page_url)
        link_finder.feed(page_text)
    except Exception as err:
        print(str(err))
        return set()
    return link_finder.page_links()
def gather_links(page_url):
    """Return all links discovered on page_url, or an empty set if the
    page cannot be fetched or parsed."""
    raw_markup = ""
    try:
        response = urlopen(page_url)
        header_value = response.getheader('Content-Type')
        if 'text/html' in header_value:
            raw_markup = response.read().decode('utf-8')
        parser = LinkFinder(Spider.base_url, page_url)
        parser.feed(raw_markup)
    except Exception as problem:
        print(str(problem))
        return set()
    return parser.page_links()
def gather_links(page_url):
    """Fetch page_url and return the links extracted by the project's
    parser; an empty set when the fetch or parse fails."""
    try:
        response = urlopen(page_url)
        # The project's parser works on the raw downloaded payload.
        html_bytes = response.read()
        finder = LinkFinder(Spider.base_url, page_url)
        links = finder.parseAndGetLinks(html_bytes)
    except Exception as e:
        # BUG FIX: the try/except was commented out, so a single dead
        # link crashed the whole crawl thread instead of being skipped;
        # the stale commented-out code is removed.
        print(str(e))
        return set()
    return links
def gather_links(page_url):
    """Fetch page_url and return its links; an empty set when crawling
    fails."""
    html_string = ''
    try:
        response = urlopen(page_url)  # Fetch the page
        # BUG FIX: match with `in` -- the header normally carries a
        # charset suffix, so strict equality rejected real HTML pages.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
        return finder.page_links()
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: Can not scrawl page')
        return set()
def gather_links(page_url):
    """Fetch page_url and return the links found on it (empty set on
    error)."""
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: substring match -- exact equality missed headers like
        # 'text/html; charset=utf-8'.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: can not crawl page')
        # Something must be returned, so we just return an empty set.
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url with requests and return the links found in its
    HTML (empty set on error)."""
    html_string = ''
    try:
        response = requests.get(page_url)
        if 'text/html' in response.headers['Content-Type']:
            # BUG FIX: str(response.content) produced the literal
            # "b'...'" repr of the bytes (escape sequences included),
            # corrupting the markup; response.text decodes it properly.
            html_string = response.text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(e)
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url using the Spider's default headers and return the
    links found on the page (empty set when access fails)."""
    html_str = ''
    try:
        request = Request(page_url, headers=Spider.headers)
        response = urlopen(request)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_str = html_bytes.decode('utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_str)
    except Exception:
        # BUG FIX: narrowed from a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit.
        print('Cannot access ' + page_url)
        return set()
    return finder.page_links()
def gather_link(page_url):
    """Fetch page_url and return the set of links found on it
    (empty set on error)."""
    html_string = ''
    try:
        # BUG FIX: the parameter was misspelled `page_rul` while the body
        # used `page_url`, raising NameError on every call.
        response = urlopen(page_url)
        # BUG FIX: getheader was called as getheader('content-type' ==
        # 'text/html'), i.e. with the boolean False as the header name.
        if 'text/html' in (response.getheader('content-type') or ''):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        # BUG FIX: `Spider.base_rul` / `Spider.page_url` were typos for
        # Spider.base_url and the local page_url.
        finder = LinkFinder(Spider.base_url, page_url)
        # BUG FIX: the finder must be fed the decoded string, not bytes.
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print("error")
        return set()
    # BUG FIX: page_links was returned unbound (missing call parentheses).
    return finder.page_links()
def gather_links(page_url):
    """Crawl page_url and return the set of URLs found on it.

    Returns an empty set when the response is not HTML or when the page
    cannot be crawled.
    """
    try:
        response = urlopen(page_url)
        if 'text/html' in response.info().getheader('Content-Type'):
            html_string = response.read().decode("utf-8")
            finder = LinkFinder(Spider.base_url)
            finder.feed(html_string)
            # Return the set of crawled URLs.
            return finder.get_links()
        # BUG FIX: non-HTML responses used to fall off the end and return
        # None, which breaks set operations in the caller.
        return set()
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error:can not crawl page.')
        return set()
def gather_links(page_url):
    """Fetch page_url pretending to be a desktop browser and return the
    links found in the HTML (empty set on error)."""
    html_string = ''
    try:
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        # BUG FIX: requests.get(page_url, header) passed the dict as the
        # positional `params` argument, so the User-Agent was never sent;
        # it must be passed as headers=.
        response = requests.get(page_url, headers=header)
        # BUG FIX: substring match -- the exact comparison with
        # 'text/html; charset=utf-8' missed any other charset spelling.
        if 'text/html' in response.headers['Content-Type']:
            html_string = response.text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: can not crawl page')
        return set()
    return finder.page_links()
def gather_links(page_url):
    """Fetch page_url and return the set of links found on it; an empty
    set when the request or the parse fails."""
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: the two exact-match branches ('text/html' and
        # 'text/html;charset=utf-8') still missed common variants such as
        # 'text/html; charset=UTF-8'; one substring test covers them all.
        if 'text/html' in (response.getheader('Content-Type') or ''):
            html_bytes = response.read()
            html_string = html_bytes.decode(encoding='utf-8')
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print('\nException : ' + str(e) + '\n')
        return set()
    return finder.page_links()
def gather_link(page_url): html_string = '' try: response = urlopen(page_url) # convert bytes from the python parsing data to human readable data if response.info()['Content-type']=='text/html' or \ response.info()['content-type'] == 'text/html; charset=utf-8' or \ response.info()['Content-type'] == 'text/html; charset=utf-8' or \ response.info()['Content-type'] == 'text/html; charset=UTF-8': html_bytes = response.read() html_string = html_bytes.decode("utf-8") finder = LinkFinder(PySpider.base_url, page_url) finder.feed(html_string) except: print 'Error: can not crawl page' return set() return finder.page_links()
class LinkFinderTest(unittest.TestCase):
    """Exercises LinkFinder against a known MLB gameday listing page."""

    def setUp(self):
        # Fixture: a gameday directory listing that always contains links.
        listing = "http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_22/"
        self.search_url = listing
        self.link_finder = LinkFinder(listing)

    def testUrlIsFound(self):
        # A link must be extractable from the listing page.
        self.assertIsNotNone(self.link_finder.get_link())
def gather_links(page_url):
    """Fetch page_url via requests and return the links found on it;
    an empty set when the page cannot be crawled."""
    try:
        html_string = requests.get(page_url).text
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # BUG FIX: narrowed from a bare `except:`; the stale block of
        # commented-out urllib code has also been removed.
        print('Error: Cannot crawl page - {}'.format(page_url))
        return set()
    return finder.get_page_links()
def gather_link(page_url):
    """Go to the website, get the byte data, convert it to a string and
    pass it to LinkFinder to collect all the links.

    Returns an empty set on any failure.
    """
    html_string = ''
    try:
        response = urlopen(page_url)
        # BUG FIX: `'text/html' in response.info().headers` raised
        # AttributeError on every call (HTTPMessage has no .headers
        # attribute), so the function always returned an empty set.
        # Inspect the Content-Type header instead.
        if 'text/html' in (response.getheader('Content-Type') or ''):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
def gather_link(page_url):
    """Fetch page_url and return the links found on it; an empty set when
    the page cannot be crawled."""
    html_string = ''
    # urlopen returns byte data which we have to turn into a readable string
    try:
        response = urlopen(page_url)
        # Make sure it is html data (in case we crawl a pdf file).
        # BUG FIX: substring test -- strict equality missed headers with
        # a charset suffix.
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            # BUG FIX: "utf-a" is not a codec; every HTML page raised
            # LookupError and was silently skipped.
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception:
        # Narrowed from a bare `except:`.
        print('Error: Cannot crawl page')
        # Return empty set if we cannot crawl the link
        return set()
    return finder.page_links()
class Friends_finder():
    """Selenium-driven helper that logs into Facebook, lets the user
    scroll manually, scrapes people links from the rendered page, and
    appends the new ones to a queue file.

    NOTE(review): runs an infinite gather loop from __init__; the object
    never finishes constructing by design.
    """

    def __init__(self, user_name, password):
        # Credentials used by log_in().
        self.user_name = user_name
        self.password = password
        self.facebook_url = "https://www.facebook.com/"
        self.more_clicks = 0
        # Links already present in the data files (loaded lazily).
        self.existent_people_links = set()
        self.setup()
        self.log_in()
        # Endless interactive loop: manual scroll -> scrape -> persist.
        while 1:
            self.scroll_down_mannualy()
            self.gather_links()
            self.append_links_to_queue()

    def setup(self):
        """Start Firefox and open the Facebook front page."""
        print('Seting up WebDriver')
        self.driver = webdriver.Firefox()
        self.driver.get(self.facebook_url)

    def log_in(self):
        """Fill the login form and retry until the post-login element
        '//*[@id="u_0_2"]' is present (falling back to manual login)."""
        ready = False
        while ready == False:
            ready = True
            try:
                self.driver.find_element_by_id("email").send_keys(self.user_name)
            except:
                # Login form not ready yet; retry the whole attempt.
                ready = False
            self.driver.find_element_by_id("pass").send_keys(self.password)
            self.driver.find_element_by_id("pass").send_keys(Keys.RETURN)
            sleep(2)
            try:
                # Element that only exists once logged in.
                self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
                print('Conected')
            except:
                print('Unable to conect, Please do it manually')
                ready = False
                # Poll until the user has completed the login by hand.
                while ready == False:
                    try:
                        self.driver.find_element_by_xpath('//*[@id="u_0_2"]')
                        ready = True
                    except:
                        pass

    def scroll_down_mannualy(self):
        """Block until the user has scrolled the page and pressed Enter."""
        print("please scroll down the page")
        print("When done, press any key to start gathering links")
        input()

    def gather_links(self):
        """Parse the driver's current page source and cache the links."""
        print('gathering links, please wait ...')
        self.link_finder = LinkFinder()
        self.link_finder.feed(self.driver.page_source)
        self.gathered_links = self.link_finder.get_links()
        print(str(len(self.gathered_links)) + ' Links was gathered')

    def append_links_to_queue(self):
        """Refresh the known-links set, then write any new links out."""
        print('Apending links and updating the queue file...')
        self.get_existent_links()
        self.update_queue()

    def get_existent_links(self):
        """Load every already-known link (queued, errored, added) so
        update_queue() can skip duplicates."""
        with open("data/people_to_add.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))
        with open("data/errors.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))
        with open("data/added_friends.txt", "r") as f:
            for line in f:
                self.existent_people_links.add(line.replace('\n', ''))

    def update_queue(self):
        """Append gathered links that are not yet known to the queue file
        and report how many were added."""
        self.new_links_added = 0
        with open("data/people_to_add.txt", "a") as f:
            for item in self.gathered_links:
                if item not in self.existent_people_links:
                    self.new_links_added += 1
                    f.write(item + '\n')
        print(str(self.new_links_added) + ' Items were added to the queue file')
def gather_links(self):
    """Harvest links from the WebDriver's current page source and store
    them on the instance for later queueing."""
    print('gathering links, please wait ...')
    self.link_finder = LinkFinder()
    self.link_finder.feed(self.driver.page_source)
    found = self.link_finder.get_links()
    self.gathered_links = found
    print(str(len(found)) + ' Links was gathered')
def setUp(self):
    """Build a LinkFinder pointed at a known MLB gameday listing URL."""
    # Fixture URL: a gameday directory listing page containing many links.
    self.search_url = "http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_22/"
    self.link_finder = LinkFinder(self.search_url)