import requests
from bs4 import BeautifulSoup


def busca_link():
    """Fetch a page and print the href of every anchor tag."""
    response = requests.get("http://www.google.com")
    soup = BeautifulSoup(response.text, "lxml")
    print("Links")
    all_links = soup.find_all("a")
    for link in all_links:
        print(link.get("href"))
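# A minimal sketch of the same link dump done via XPath, assuming the
# "parser" helper shared by the snippets below is an lxml-based HTML parser.
# That alias is an assumption: the actual definition of parser() is not part
# of this collection.
from lxml.html import fromstring as parser  # assumed definition of parser()

import requests


def busca_link_xpath():
    response = requests.get("http://www.google.com")
    tree = parser(response.text)
    # //a/@href selects the href attribute of every anchor element
    for href in tree.xpath("//a/@href"):
        print(href)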
def process(self, document_field, extract_dict, **kwargs):
    """Apply XPath-based extraction to this document."""
    # logger.debug("field={field}, extract_dict={extract_dict}".format(**locals()))
    if isinstance(extract_dict, str):
        extract_dict = self._parse_strings_that_contain_dicts(extract_dict)
    try:
        parsed_document = parser(document_field)
    except Exception:
        try:
            logger.info("attempting to process after utf-8 encoding")
            parsed_document = parser(
                document_field.encode("utf-8", "replace"))
        except Exception:
            logger.warning(
                "failed to parse document! using xpath_parser, dict={extract_dict}"
                .format(**locals()))
            raise  # no parsed document to extract from, so propagate
    parsed_fields = parse_dict(parsed_document, extract_dict)
    return parsed_fields
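# A sketch of the parse_dict() helper called above, under the assumption that
# extract_dict maps output field names to XPath expressions. The real helper
# is not shown in this collection, so this signature and behavior are
# hypothetical.
def parse_dict(parsed_document, extract_dict):
    parsed_fields = {}
    for field_name, xpath_expr in extract_dict.items():
        # xpath() returns a list of matches; keep it as-is so callers decide
        # how to collapse single-valued fields
        parsed_fields[field_name] = parsed_document.xpath(xpath_expr)
    return parsed_fields

# Example extract_dict: {"title": "//title/text()", "links": "//a/@href"}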
def handle_page_unit(self, unit):
    url = self.page_counter.construct_url(unit)
    try:
        page = self.get_page_content(url)
        if page is None:
            stderr.write('*** Problems with {0}. Please check'.format(url))
            return tuple()
        return self.get_elements(parser(page))
    except HTTPError as e:
        stderr.write("!!! Page '{0}' is unavailable [{1.code}]".format(url, e))
        return tuple()
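# A sketch of the get_page_content() helper assumed above, showing where the
# HTTPError that handle_page_unit() catches would originate if the page were
# fetched with urllib. The real helper is not part of this collection, so
# both the fetch library and the decoding policy are assumptions.
from urllib.request import urlopen
from urllib.error import HTTPError


def get_page_content(url):
    # urlopen raises HTTPError on 4xx/5xx responses; let it propagate so the
    # caller's except clause can report the status code
    with urlopen(url) as response:
        return response.read().decode("utf-8", "replace")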
def parse_page(self, url, results):
    for attempt in range(self.tries):
        try:
            results.append(self.start_parse_page(
                url, parser(self.get_page_content(url))))
        except ValueError:
            # self.get_page_content returned None and parser died; retry
            pass
        except (HTTP404, IndexError):
            # IndexError is raised by lxml.etree when parsing was unsuccessful
            print('Some problems with {0}. Please check'.format(url))
        else:
            return  # parsed successfully; stop retrying
    print('Some problems with {0}. Please check'.format(url))
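# A standalone sketch of the retry pattern used by parse_page() above: try up
# to `tries` times, stop on the first success, and report failure only after
# every attempt is exhausted. The names fetch_with_retries and fetch are
# illustrative, not from the original code.
def fetch_with_retries(fetch, url, tries=3):
    for attempt in range(tries):
        try:
            return fetch(url)
        except ValueError:
            continue  # transient failure; try again
    print('Some problems with {0}. Please check'.format(url))
    return None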