# NOTE(review): truncated duplicate of get_page_with_param (full version appears
# later in this file). Fetches self.url (no POST data when params is None),
# transcodes windows-1255 -> utf-8, and parses with BeautifulSoup, retrying the
# parse after stripping comments/<script>/<!...> markup. This copy ends after the
# second parse failure; the success path and the params-POST branch are cut off.
# Python 2 syntax (`except HTMLParseError, e`) — do not run under Python 3 as-is.
def get_page_with_param(self, params): logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params)) if params == None: try: html_page = urllib2.urlopen( self.url).read().decode('windows-1255').encode('utf-8') except urllib2.URLError: logger.error("can't open URL: %s" % self.url) send_chat_notification(__name__, 'failed to open url', { 'url': self.url, 'params': None }) return None try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e)) html_page = re.sub( "(?s)<!--.*?-->", " ", html_page) # cut anything that looks suspicious html_page = re.sub("(?s)<script>.*?</script>", " ", html_page) html_page = re.sub("(?s)<!.*?>", " ", html_page) try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', { 'url': self.url, 'params': None }) return None
def _scrape(self):
    """Fetch the lobbyists index page and parse it into a BeautifulSoup tree.

    On any failure a chat notification is sent and the original exception
    is re-raised.

    Fixes vs. original: uses `except ... as` (consistent with the other
    handlers in this file) and a bare `raise` instead of `raise e`, which
    in Python 2 would discard the original traceback.
    """
    try:
        html = self.source.fetch()
        soup = BeautifulSoup(html)
    except Exception:
        send_chat_notification(
            __file__, 'failed to fetch or parse the lobbyists index page',
            {'url': self.LOBBYISTS_INDEX_PAGE_URL})
        raise  # re-raise with the original traceback intact
def _scrape(self):
    """Fetch and parse the lobbyists index page.

    Sends a chat notification and re-raises on any fetch/parse failure.

    Fixes vs. original: modern `except` syntax and a bare `raise` rather
    than `raise e` (Python 2 `raise e` resets the traceback, hiding the
    real failure point).
    """
    try:
        html = self.source.fetch()
        soup = BeautifulSoup(html)
    except Exception:
        send_chat_notification(
            __file__,
            'failed to fetch or parse the lobbyists index page',
            {'url': self.LOBBYISTS_INDEX_PAGE_URL})
        raise  # preserve original traceback
def get_page(self, url):
    """Fetch the raw HTML of *url* with bounded retries.

    Retries up to 10 times when urlopen raises urllib2.URLError OR when
    the response body is empty; after the 10th failure a chat
    notification is sent and urllib2.URLError is raised.  HTML comments
    and <script> blocks are stripped from the returned page.

    Fix vs. original: the retry counter only advanced on URLError, so a
    server that kept returning an empty body made the loop spin forever.
    Empty responses now count against the retry budget too.
    """
    html_page = None
    retry_count = 0
    while not html_page:
        try:
            html_page = urllib2.urlopen(url, timeout=30).read()
        except urllib2.URLError:
            pass  # treated as a failed attempt below
        if not html_page:
            retry_count += 1
            if retry_count >= 10:
                send_chat_notification(__name__, "URL failed too many times", {"url": url})
                raise urllib2.URLError('URL %s failed too many times' % url)
    # cut anything that looks suspicious
    html_page = re.sub("(?s)<!--.*?-->", " ", html_page)
    html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
    return html_page
def _get_meetings(self, committee_id, from_date, to_date):
    """Fetch meetings for a committee in a date range via the dataservice.

    Any failure is logged, reported, and announced on chat; an empty list
    is returned in that case so callers can continue.
    """
    try:
        return DataserviceCommitteeMeeting.get(committee_id, from_date, to_date)
    except Exception as e:
        message = ERR_MSG.format(committee_id)
        report = ERR_MSG_REPORT.format(committee_id, str(e))
        DataserviceCommitteeMeeting.error_report(message, report)
        self._log_error(message)
        send_chat_notification(
            __name__,
            "Received unexpected exception from DataServiceCommitteeMeeting.get()",
            {'exception': traceback.format_exc(),
             'committee_id': committee_id,
             'from_date': from_date,
             'to_date': to_date})
        return []
def _get_committees_index_page(full):
    """Return the committees index page HTML as unicode, or "" on failure.

    full -- when True fetch FULL_URL, otherwise URL.

    Fix vs. original: the bare `except:` also swallowed SystemExit and
    KeyboardInterrupt; narrowed to `except Exception`.
    """
    if full:
        url = FULL_URL
        encoding = "iso_8859_8"
    else:
        url = URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they
        # reverted back to iso-8859-8
        encoding = "iso_8859_8"
    logger.info("getting index page html from " + url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except Exception:
        logger.error("could not fetch committees_index_page, exception: " + traceback.format_exc())
        send_chat_notification(__name__, "could not fetch committees index page", {"url": url})
        return ""
def _get_committees_index_page(full):
    """Return the committees (or plenum) index page HTML as unicode.

    full -- when True fetch FULL_URL, otherwise PLENUM_URL.
    Returns '' on any fetch/decode failure (logged and announced on chat).

    Fix vs. original: narrowed the bare `except:` (which also caught
    SystemExit/KeyboardInterrupt) to `except Exception`.
    """
    if full:
        url = FULL_URL
        encoding = 'iso_8859_8'
    else:
        url = PLENUM_URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they
        # reverted back to iso-8859-8
        encoding = 'iso_8859_8'
    logger.info('getting index page html from %s' % url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except Exception:
        logger.exception(u'could not fetch committees_index_page for url %s' % url)
        send_chat_notification(__name__, "could not fetch committees index page", {'url': url})
        return ''
def _get_committees_index_page(full):
    """Return the committees index page HTML as unicode, or '' on failure.

    full -- when True fetch FULL_URL, otherwise URL.

    Fix vs. original: narrowed the bare `except:` (which also caught
    SystemExit/KeyboardInterrupt) to `except Exception`.
    """
    if full:
        url = FULL_URL
        encoding = 'iso_8859_8'
    else:
        url = URL
        # encoding='utf8'
        # the encoding of this page used to be utf-8 but looks like they
        # reverted back to iso-8859-8
        encoding = 'iso_8859_8'
    logger.info('getting index page html from %s' % url)
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except Exception:
        logger.exception(u'could not fetch committees_index_page for url %s' % url)
        send_chat_notification(__name__, "could not fetch committees index page", {'url': url})
        return ''
# NOTE(review): truncated duplicate of get_page_with_param (a complete version
# appears later in this file). Fetches self.url, transcodes windows-1255 ->
# utf-8, parses with BeautifulSoup, and on parse failure strips
# comments/<script>/<!...> markup and retries once. This copy is cut off after
# the second parse failure — the success return and the POST (params) branch
# are missing here. Python 2 `except X, e` syntax.
def get_page_with_param(self,params): logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params)) if params == None: try: html_page = urllib2.urlopen(self.url).read().decode('windows-1255').encode('utf-8') except urllib2.URLError: logger.error("can't open URL: %s" % self.url) send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': None}) return None try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e)) html_page = re.sub("(?s)<!--.*?-->"," ", html_page) # cut anything that looks suspicious html_page = re.sub("(?s)<script>.*?</script>"," ", html_page) html_page = re.sub("(?s)<!.*?>"," ", html_page) try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None}) return None
def get_page_with_param(self, params):
    """Fetch self.url and return it as a BeautifulSoup tree, or None on failure.

    params -- None/empty for a plain GET; otherwise a mapping that is
    url-encoded and sent as POST data.  The response is transcoded from
    windows-1255 to utf-8 before parsing.  When the initial parse fails,
    HTML comments, <script> blocks and <!...> declarations are stripped
    and the parse is retried once.  HTML comment nodes are removed from
    the returned tree.  Failures are logged and announced on chat.

    Fixes vs. original: dropped the unused `as e` binding on the URLError
    handler, and replaced list comprehensions used purely for their side
    effect with plain loops.
    """
    logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params))
    if not params:
        try:
            html_page = urllib2.urlopen(self.url).read().decode('windows-1255').encode('utf-8')
        except urllib2.URLError:
            logger.error("can't open URL: %s" % self.url)
            send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': params})
            return None
        try:
            soup = BeautifulSoup(html_page)
        except HTMLParseError as e:
            logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e))
            # cut anything that looks suspicious
            html_page = re.sub("(?s)<!--.*?-->", " ", html_page)
            html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
            html_page = re.sub("(?s)<!.*?>", " ", html_page)
            try:
                soup = BeautifulSoup(html_page)
            except HTMLParseError as e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None})
                return None
        # strip HTML comment nodes from the parsed tree
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup
    else:
        data = urllib.urlencode(params)
        try:
            url_data = urllib2.urlopen(self.url, data)
        except urllib2.URLError:
            logger.error("can't open URL: %s" % self.url)
            send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': data})
            return None
        html_page = url_data.read().decode('windows-1255').encode('utf-8')
        try:
            soup = BeautifulSoup(html_page)
        except HTMLParseError as e:
            logger.debug("error parsing URL: %s - %s" % (self.url, e))
            send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': data})
            return None
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup
def get_page_with_param(self, params):
    """Fetch self.url (GET when params is falsy, POST otherwise) as a soup.

    Returns a BeautifulSoup tree with HTML comment nodes removed, or None
    on any fetch/parse failure (each failure is logged and a chat
    notification is sent).  The body is transcoded windows-1255 -> utf-8;
    a failed parse is retried once after stripping comments, <script>
    blocks and <!...> declarations.

    Fixes vs. original: removed the unused `as e` binding on the URLError
    handler and turned the side-effect-only list comprehensions
    (`[comment.extract() ...]`) into plain loops.
    """
    logger.debug('get_page_with_param: self.url=%s, params=%s' % (self.url, params))
    if not params:
        try:
            html_page = urllib2.urlopen(
                self.url).read().decode('windows-1255').encode('utf-8')
        except urllib2.URLError:
            logger.error("can't open URL: %s" % self.url)
            send_chat_notification(__name__, 'failed to open url', {
                'url': self.url,
                'params': params
            })
            return None
        try:
            soup = BeautifulSoup(html_page)
        except HTMLParseError as e:
            logger.debug("parsing URL: %s - %s. will try harder." % (self.url, e))
            # cut anything that looks suspicious
            html_page = re.sub("(?s)<!--.*?-->", " ", html_page)
            html_page = re.sub("(?s)<script>.*?</script>", " ", html_page)
            html_page = re.sub("(?s)<!.*?>", " ", html_page)
            try:
                soup = BeautifulSoup(html_page)
            except HTMLParseError as e:
                logger.debug("error parsing URL: %s - %s" % (self.url, e))
                send_chat_notification(__name__, 'failed to parse url', {
                    'url': self.url,
                    'params': None
                })
                return None
        # strip HTML comment nodes from the parsed tree
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup
    else:
        data = urllib.urlencode(params)
        try:
            url_data = urllib2.urlopen(self.url, data)
        except urllib2.URLError:
            logger.error("can't open URL: %s" % self.url)
            send_chat_notification(__name__, 'failed to open url', {
                'url': self.url,
                'params': data
            })
            return None
        html_page = url_data.read().decode('windows-1255').encode('utf-8')
        try:
            soup = BeautifulSoup(html_page)
        except HTMLParseError as e:
            logger.debug("error parsing URL: %s - %s" % (self.url, e))
            send_chat_notification(__name__, 'failed to parse url', {
                'url': self.url,
                'params': data
            })
            return None
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup
# NOTE(review): two incomplete fragments on this line — the tail of a
# get_page_with_param variant (its `def` line is not visible here) and the
# truncated head of fix_dash (cut off after the re.match call). Left byte-for-
# byte intact; see the complete get_page_with_param versions elsewhere in this
# file. Python 2 `except X, e` syntax.
html_page = re.sub("(?s)<script>.*?</script>"," ", html_page) html_page = re.sub("(?s)<!.*?>"," ", html_page) try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': None}) return None return soup else: data = urllib.urlencode(params) try: url_data = urllib2.urlopen(self.url,data) except urllib2.URLError: logger.error("can't open URL: %s" % self.url) send_chat_notification(__name__, 'failed to open url', {'url': self.url, 'params': data}) return None html_page = url_data.read().decode('windows-1255').encode('utf-8') try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', {'url': self.url, 'params': data}) return None return soup def fix_dash(s): """returns s with normalized spaces before and after the dash""" if not s: return None m = re.match(r'(תיקון)( ?)(-)( ?)(.*)'.decode('utf8'),s)
# NOTE(review): incomplete fragment — the tail of a get_page_with_param
# variant starting mid-`except` (its `def` line and the opening of the
# `if`/`else` are not visible here). Left byte-for-byte intact; see the
# complete versions elsewhere in this file. Python 2 `except X, e` syntax.
except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', { 'url': self.url, 'params': None }) return None return soup else: data = urllib.urlencode(params) try: url_data = urllib2.urlopen(self.url, data) except urllib2.URLError: logger.error("can't open URL: %s" % self.url) send_chat_notification(__name__, 'failed to open url', { 'url': self.url, 'params': data }) return None html_page = url_data.read().decode('windows-1255').encode('utf-8') try: soup = BeautifulSoup(html_page) except HTMLParseError, e: logger.debug("error parsing URL: %s - %s" % (self.url, e)) send_chat_notification(__name__, 'failed to parse url', { 'url': self.url, 'params': data }) return None return soup