def test_page_parser(self):
    """Makes sure we can parse pages correctly."""
    args = util.parse_page('key: value\nkeys: one, two\n#ignore: me\n---\nhello, world.')
    self.assertEquals(3, len(args))
    self.assertEquals(args.get('key'), 'value')
    self.assertEquals(args.get('keys'), ['one', 'two'])
    self.assertEquals(args.get('text'), 'hello, world.')

    # windows line endings
    args = util.parse_page('key: value\nkeys: one, two\n#ignore: me\r\n---\r\nhello, world.')
    self.assertEquals(3, len(args))

    # old mac line endings
    args = util.parse_page('key: value\nkeys: one, two\n#ignore: me\r---\rhello, world.')
    self.assertEquals(3, len(args))
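# util.parse_page itself is not shown in any of these snippets.  The sketch
# below is a minimal implementation that satisfies the assertions above; the
# rule that keys ending in "s" hold comma-separated lists is inferred from the
# key/keys pair in the test (and from labels/readers/editors elsewhere), not
# confirmed against the real gaewiki source.
def parse_page(source):
    """Splits page source into a dict of properties plus the body text.

    The header is a block of "key: value" lines terminated by a line
    containing only "---"; lines starting with "#" are ignored and the
    body is stored under the "text" key.
    """
    # Normalize Windows (\r\n) and old Mac (\r) line endings first.
    lines = source.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    options = {}
    for num, line in enumerate(lines):
        if line.strip() == '---':
            options['text'] = '\n'.join(lines[num + 1:])
            break
        if line.startswith('#') or ':' not in line:
            continue
        key, value = (part.strip() for part in line.split(':', 1))
        if key.endswith('s'):
            # Plural keys ("keys", "labels", "readers", ...) hold lists.
            value = [item.strip() for item in value.split(',')]
        options[key] = value
    return options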
def can_read_page(title, user, is_admin):
    """Returns True if the user is allowed to read the specified page.

    Admins and global readers and editors are allowed to read all pages.
    Other users are allowed to read a page if the wiki is open for reading
    or if the user is listed in the page's readers/editors property.
    Otherwise access is denied."""
    if is_admin:
        return True

    is_user_reader = user and (user.email() in settings.get('readers', [])
                               or user.email() in settings.get('editors', []))
    if is_user_reader:
        return True

    page = model.WikiContent.get_by_title(title)
    options = util.parse_page(page.body or '')

    is_open_wiki = settings.get('open-reading', 'yes') == 'yes'
    if is_open_wiki:
        if options.get('private') != 'yes':
            return True
        return user and (user.email() in options.get('readers', [])
                         or user.email() in options.get('editors', []))
    elif settings.get('open-reading') == 'login':
        return options.get('public') == 'yes' or user
    else:
        return options.get('public') == 'yes'
def put(self):
    """Adds the gaewiki:parent: labels transparently."""
    if self.body is not None:
        options = util.parse_page(self.body)
        self.redirect = options.get('redirect')
        self.pread = options.get('public') == 'yes' and options.get('private') != 'yes'
        self.labels = options.get('labels', [])
        if 'date' in options:
            try:
                self.created = datetime.datetime.strptime(options['date'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
        if 'name' in options and options['name'] != self.title:
            if self.get_by_title(options['name'], create_if_none=False) is not None:
                raise ValueError('A page named "%s" already exists.' % options['name'])
            self.title = options['name']
        self.__update_geopt()
        self.links = util.extract_links(self.body)
        self.add_implicit_labels()
    db.Model.put(self)
    settings.check_and_flush(self)
def fetch_offers_details():
    links = util.read_json_from_file(output_filename)
    log.info('\nstarting offer fetch, %s to go', len(links))
    successful = 0
    for item_chunk in util.chunks([link for link in links if link['Fetched'] is False], 5):
        for item in item_chunk:
            link = item['Link']
            page = util.parse_page(link)
            try:
                data = extract_offer_details(page)
                # Merge the fetched details into the link record.
                item.update(data)
                successful += 1
            except Exception:
                log.error('extracting details from page failed url=%s', link, exc_info=True)
        # Checkpoint after every chunk so progress survives a crash.
        log.info('pages parsed successfully %s, unsuccessful %s', successful, len(links) - successful)
        util.write_json_to_file(output_filename, links)

    log.info('pages parsed successfully %s, unsuccessful %s', successful, len(links) - successful)
    util.write_json_to_file(output_filename, links)
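# util.chunks is not shown either; given how it is called above, it is
# presumably a plain fixed-size slicer along these lines (an assumption,
# not the project's actual code):
def chunks(items, size):
    """Yields consecutive slices of `items`, each at most `size` long."""
    for start in range(0, len(items), size):
        yield items[start:start + size]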
def get_chapters():
    with util.connect() as conn:
        cur = conn.cursor()
        for url in util.links:
            _, data = cur.execute('SELECT * FROM dump WHERE url = ?', (url,)).fetchone()
            title, content = util.parse_page(data)
            yield url, title, content
def get_all():
    settings = memcache.get('gaewiki:settings')
    if settings is None:
        settings = util.parse_page(get_host_page().body)
        try:
            # .get() guards against a missing key; only a present but
            # unknown value is reset to UTC.
            pytz.timezone(settings.get('timezone', 'UTC'))
        except pytz.UnknownTimeZoneError:
            logging.warning('Unknown timezone: %s, reset to UTC', settings['timezone'])
            settings['timezone'] = 'UTC'
        memcache.set('gaewiki:settings', settings)
    return settings
def put(self):
    """Adds the gaewiki:parent: labels transparently."""
    if self.body is not None:
        options = util.parse_page(self.body)
        self.redirect = options.get('redirect')
        self.pread = options.get('public') == 'yes' and options.get('private') != 'yes'
        self.labels = options.get('labels', [])
        if 'date' in options:
            try:
                self.created = datetime.datetime.strptime(options['date'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
        if 'name' in options:
            self.title = options['name']
        self.__update_geopt()
        self.add_implicit_labels()
    db.Model.put(self)
    settings.check_and_flush(self)
def extract_offers_links(url=None, page=None, links=None):
    assert not (url is None and page is None), 'page or url required'
    if not page:
        page = util.parse_page(url)
    if not links:
        links = []

    links_from_page = [
        {
            'Id': selection["data-ad-id"],
            'Fetched': False,
            'Link': selection["href"],
        }
        for selection in page.find_all("a", {"class": "offer-title__link"})
        if not util.find(links, lambda x: x['Link'] == selection["href"])
    ]
    log.info('links extracted %s', len(links_from_page))

    # Follow pagination recursively; return the accumulated links once
    # there is no "next" button left.
    if next_page_url := page.find('li', {"class": "next abs"}):
        next_page_url = next_page_url.find('a')['href']
        log.info('next_page_url %s', next_page_url)
        return extract_offers_links(url=next_page_url, links=links_from_page + links)
    return links_from_page + links
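# util.find, used for de-duplication above, reads like a first-match lookup.
# A minimal sketch under that assumption:
def find(items, predicate):
    """Returns the first element of `items` for which `predicate` is true,
    or None if there is no match."""
    for item in items:
        if predicate(item):
            return item
    return None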
def get_all():
    help_page = memcache.get('gaewiki:syntax')
    if help_page is None:
        help_page = util.parse_page(get_page().body)
        memcache.set('gaewiki:syntax', help_page)
    return help_page['text']
def get_all():
    settings = memcache.get('gaewiki:settings')
    if settings is None:
        settings = util.parse_page(get_host_page().body)
        memcache.set('gaewiki:settings', settings)
    return settings
def pretty():
    url = request.args['url']
    data, = get_db().execute('SELECT data FROM dump WHERE url = ?', (url,)).fetchone()
    title, content = util.parse_page(data)
    paragraphs = (p for p in content.split('\n') if p.strip())
    return pretty_tmpl.render(title=title, paragraphs=paragraphs)