def edit(self, text, title=None, date_of_pub=None):
    """
    Edit concept.

    The edit form of the concept is downloaded first, so that values which
    were not given by the caller (title, publication date) can be taken
    from the form and sent back unchanged.

    Args:
        text (str): New text of the concept.
        title (str, default None): New title of the concept. If not set,
              old title is used.
        date_of_pub (str/int, default None): Date string in abclinuxu
                    format or timestamp determining when the concept should
                    be automatically published. If not set, the value
                    found in the edit form is reused.

    Raises:
        AssertionError: When the edit form can't be found in the page.

    Note:
        `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.
    """
    if not self._meta:
        self._init_metadata()

    data = download(
        url_context(self._meta["Uprav zápis"]),
        session=self._session
    )
    dom = dhtmlparser.parseString(data)

    forms = dom.find("form", {"name": "form"})

    # Raised explicitly (instead of using the `assert` statement), so the
    # check is not stripped when python runs with -O. The exception type is
    # kept for backward compatibility.
    if not forms:
        raise AssertionError("Can't find edit form!")

    form = first(forms)
    form_action = form.params["action"]

    # keep the current title, if the new one was not given
    if title is None:
        title = first(form.find("input", {"name": "title"}))
        title = title.params["value"]

    # pick the publication date: form value / given string / timestamp
    if date_of_pub is None:
        date = first(form.find("input", {"name": "publish"}))
        date = date.params["value"]
    elif isinstance(date_of_pub, basestring):
        date = date_of_pub
    else:
        date = ts_to_concept_date(date_of_pub)

    data = download(
        url=url_context(form_action),
        method="POST",
        data={
            "cid": 0,
            "publish": date,
            "content": text,
            "title": title,
            "delay": "Ulož",
            "action": "edit2"
        },
        session=self._session
    )

    # both helpers are defined elsewhere; they get the response to look for
    # reported errors
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_page(data)
def edit(self, text, title=None, date_of_pub=None):
    """
    Edit concept.

    The edit form of the concept is downloaded first, so that values which
    were not given by the caller (title, publication date) can be taken
    from the form and sent back unchanged.

    Args:
        text (str): New text of the concept.
        title (str, default None): New title of the concept. If not set,
              old title is used.
        date_of_pub (str/int, default None): Date string in abclinuxu
                    format or timestamp determining when the concept should
                    be automatically published. If not set, the value
                    found in the edit form is reused.

    Raises:
        AssertionError: When the edit form can't be found in the page.

    Note:
        `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.
    """
    if not self._meta:
        self._init_metadata()

    data = download(url_context(self._meta["Uprav zápis"]),
                    session=self._session)
    dom = dhtmlparser.parseString(data)

    forms = dom.find("form", {"name": "form"})

    # Raised explicitly (instead of using the `assert` statement), so the
    # check is not stripped when python runs with -O. The exception type is
    # kept for backward compatibility.
    if not forms:
        raise AssertionError("Can't find edit form!")

    form = first(forms)
    form_action = form.params["action"]

    # keep the current title, if the new one was not given
    if title is None:
        title = first(form.find("input", {"name": "title"}))
        title = title.params["value"]

    # pick the publication date: form value / given string / timestamp
    if date_of_pub is None:
        date = first(form.find("input", {"name": "publish"}))
        date = date.params["value"]
    elif isinstance(date_of_pub, basestring):
        date = date_of_pub
    else:
        date = ts_to_concept_date(date_of_pub)

    data = download(url=url_context(form_action),
                    method="POST",
                    data={
                        "cid": 0,
                        "publish": date,
                        "content": text,
                        "title": title,
                        "delay": "Ulož",
                        "action": "edit2"
                    },
                    session=self._session)

    # both helpers are defined elsewhere; they get the response to look for
    # reported errors
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_page(data)
def add_tag(self, tag):
    """
    Assign new tag to the blogpost.

    Args:
        tag (Tag): :class:`Tag` instance. See :class:`possible_tags` for
            list of all possible tags.

    Raises:
        KeyError: In case, that `tag` is not instance of :class:`Tag`.
        ValueError: In case that :attr:`uid` is not set.

    Returns:
        list: List of :class:`Tag` objects.
    """
    if not isinstance(tag, Tag):
        raise KeyError(
            "Tag have instance of Tag and to be from .possible_tags()"
        )

    if not self.uid:
        raise ValueError(
            "Can't assign tag - .uid property not set. Call .pull() or "
            "assign .uid manually."
        )

    assign_url = "/ajax/tags/assign?rid=%d&tagID=%s" % (self.uid, tag.norm)
    response_xml = download(url_context(assign_url))

    # the response is parsed and stored as the new set of tags
    self.tags = self.__class__._parse_tags(response_xml)

    return self.tags
def add_pic(self, opened_file):
    """
    Add picture to the Concept.

    Args:
        opened_file (file): opened file object
    """
    # metadata hold the link to the page with the upload form
    if not self._meta:
        self._init_metadata()

    form_page = download(
        url_context(self._meta["Přidej obrázek"]),
        session=self._session
    )

    # the upload form is the first multipart form at the page
    upload_forms = dhtmlparser.parseString(form_page).find(
        "form",
        {"enctype": "multipart/form-data"}
    )
    upload_url = first(upload_forms).params["action"]

    # send the picture itself
    response = self._session.post(
        url_context(upload_url),
        data={
            "action": "addScreenshot2",
            "finish": "Nahrát"
        },
        files={"screenshot": opened_file}
    )

    check_error_div(
        response.text.encode("utf-8"),
        '<div class="error" id="screenshotError">'
    )
def from_user_id(user_id):
    """
    Transform `user_id` to instance of :class:`User`.

    The profile page ``/Profile/<user_id>`` is downloaded and the username
    is taken from the link to the list of user's posts.

    Returns:
        obj: :class:`User` instance parsed from the `user_id`.
    """
    profile_html = shared.download(url_context("/Profile/" + str(user_id)))

    dom = dhtmlparser.parseString(profile_html)
    dhtmlparser.makeDoubleLinked(dom)

    shared.handle_errors(dom)

    def _points_to_user(tag):
        # True for tags with href starting with /lide/
        return tag.params.get("href", "").startswith("/lide/")

    # The wanted link looks like this:
    #   <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam ...</a>
    # so only the links with content starting with `Seznam` are collected.
    hrefs = []
    for anchor in dom.find("a", fn=_points_to_user):
        if anchor.getContent().startswith("Seznam"):
            hrefs.append(anchor.params["href"])

    # /lide/<username>/... -> <username>; the last matching link is used
    username = hrefs[-1].split("/")[2]

    return User(username)
def from_user_id(user_id):
    """
    Transform `user_id` to instance of :class:`User`.

    The profile page ``/Profile/<user_id>`` is downloaded and the username
    is taken from the link to the list of user's posts.

    Returns:
        obj: :class:`User` instance parsed from the `user_id`.
    """
    profile_html = shared.download(url_context("/Profile/" + str(user_id)))

    dom = dhtmlparser.parseString(profile_html)
    dhtmlparser.makeDoubleLinked(dom)

    shared.handle_errors(dom)

    def _points_to_user(tag):
        # True for tags with href starting with /lide/
        return tag.params.get("href", "").startswith("/lide/")

    # The wanted link looks like this:
    #   <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam ...</a>
    # so only the links with content starting with `Seznam` are collected.
    hrefs = []
    for anchor in dom.find("a", fn=_points_to_user):
        if anchor.getContent().startswith("Seznam"):
            hrefs.append(anchor.params["href"])

    # /lide/<username>/... -> <username>; the last matching link is used
    username = hrefs[-1].split("/")[2]

    return User(username)
def iter_blogposts(start=0, end=None, lazy=True):
    """
    Iterate over blogs. Based on the bloglist.

    Args:
        start (int, default 0): Start at this page.
        end (int, default None): End at this page (the page itself is still
            processed). The value is a page number, same as `start`.
        lazy (bool, default True): Initialize :class:`.Blogpost` objects
            only with information from listings. Don't download full text
            and comments.

    Yields:
        obj: :class:`.Blogpost` objects.
    """
    for cnt, url in enumerate(_next_blog_url(start)):
        data = _shared.download(url)
        data = _remove_crap_from_bloglist(data)

        # parse basic info about all blogs at page
        dom = _dhtmlparser.parseString(data)
        for bcnt, blog in enumerate(dom.findB("div", {"class": "cl"})):
            yield Blogpost.from_html(blog, lazy=lazy)

            # every page has 25 blogposts, but sometimes more of them are
            # found - the extra ones are skipped
            if bcnt >= 24:
                break

        # detect end of pagination at the bottom
        if not _should_continue(dom):
            break

        # `cnt` counts pages processed by this call, so it has to be shifted
        # by `start` to get the real page number which `end` refers to.
        # NOTE(review): assumes that _next_blog_url(start) yields the url of
        # the page `start` first - confirm against its definition.
        if end is not None and (start or 0) + cnt >= end:
            break
def add_tag(self, tag):
    """
    Assign new tag to the blogpost.

    Args:
        tag (Tag): :class:`Tag` instance. See :class:`possible_tags` for
            list of all possible tags.

    Raises:
        KeyError: In case, that `tag` is not instance of :class:`Tag`.
        ValueError: In case that :attr:`uid` is not set.

    Returns:
        list: List of :class:`Tag` objects.
    """
    if not isinstance(tag, Tag):
        raise KeyError(
            "Tag have instance of Tag and to be from .possible_tags()"
        )

    if not self.uid:
        raise ValueError(
            "Can't assign tag - .uid property not set. Call .pull() or "
            "assign .uid manually."
        )

    assign_url = "/ajax/tags/assign?rid=%d&tagID=%s" % (self.uid, tag.norm)
    response_xml = download(url_context(assign_url))

    # the response is parsed and stored as the new set of tags
    self.tags = self.__class__._parse_tags(response_xml)

    return self.tags
def add_pic(self, opened_file):
    """
    Add picture to the Concept.

    Args:
        opened_file (file): opened file object
    """
    # metadata hold the link to the page with the upload form
    if not self._meta:
        self._init_metadata()

    form_page = download(
        url_context(self._meta["Přidej obrázek"]),
        session=self._session
    )

    # the upload form is the first multipart form at the page
    upload_forms = dhtmlparser.parseString(form_page).find(
        "form",
        {"enctype": "multipart/form-data"}
    )
    upload_url = first(upload_forms).params["action"]

    # send the picture itself
    response = self._session.post(
        url_context(upload_url),
        data={
            "action": "addScreenshot2",
            "finish": "Nahrát"
        },
        files={"screenshot": opened_file}
    )

    check_error_div(
        response.text.encode("utf-8"),
        '<div class="error" id="screenshotError">'
    )
def test_end_of_bloglist(pagination):
    """
    Download one page of the bloglist and check whether the listing goes on.

    Args:
        pagination (int): Number of the page. It is multiplied by 25 to get
            the ``from`` parameter of the bloglist url.

    Returns:
        Result of ``_should_continue()`` called on the parsed page.
    """
    page_url = _shared.url_context("/blog/?from=%d" % (pagination * 25))
    raw_html = _shared.download(page_url)

    cleaned_html = _remove_crap_from_bloglist(raw_html)
    dom = _dhtmlparser.parseString(cleaned_html)

    # `progress_fn` comes from the enclosing scope; it is reported to only
    # after the page was downloaded and parsed
    if progress_fn:
        progress_fn(pagination)

    return _should_continue(dom)
def possible_tags(cls):
    """
    Get list of all possible tags which may be set.

    The list is downloaded from ``/ajax/tags/list`` and parsed by
    ``cls._parse_tags()``.

    Returns:
        list: List of :class:`Tag` objects.
    """
    list_url = url_context("/ajax/tags/list")
    all_tags_xml = download(list_url)

    return cls._parse_tags(all_tags_xml)
def _get(self, url, params=None, as_text=True):
    """
    Download `url` using ``shared.download()`` and session of this object.

    Args:
        url (str): Url on which the GET request will be sent.
        params (dict): GET parameters.
        as_text (bool, default True): Return result as text or binary
            data. Passed to ``shared.download()`` unchanged.

    Returns:
        str/binary data: depending on the `as_text` parameter.
    """
    request = {
        "url": url,
        "params": params,
        "session": self.session,
        "as_text": as_text,
    }

    return shared.download(**request)
def _init_metadata(self, data=None):
    """
    Parse the management section of the concept page into ``self._meta``.

    ``self._meta`` is set to a dict, which maps the stripped content of
    each link found in the section to its ``href``.

    Args:
        data (str, default None): HTML of the concept page. If not set
            (or empty), the page at ``self.link`` is downloaded.

    Raises:
        ValueError: When the management section is not present in the page.
    """
    section_title = '<div class="s_nadpis">Správa zápisku</div>'

    if not data:
        data = download(self.link, session=self._session)

    if section_title not in data:
        raise ValueError(
            "Can't parse metadata! It looks like I am not logged in!"
        )

    # only the part of the page following the section title is parsed
    management_html = data.split(section_title)[1]

    dom = dhtmlparser.parseString(management_html)
    section = first(dom.find("div", {"class": "s_sekce"}))

    self._meta = {}
    for item in section.find("li"):
        link = first(item.find("a"))
        label = link.getContent().strip()

        self._meta[label] = link.params["href"]
def iter_blogposts(start=0, end=None, lazy=True):
    """
    Iterate over blogs. Based on the bloglist.

    Args:
        start (int, default 0): Start at this page.
        end (int, default None): End at this page (the page itself is still
            processed). The value is a page number, same as `start`.
        lazy (bool, default True): Initialize :class:`.Blogpost` objects
            only with information from listings. Don't download full text
            and comments.

    Yields:
        obj: :class:`.Blogpost` objects.
    """
    for cnt, url in enumerate(_next_blog_url(start)):
        data = _shared.download(url)

        # get just the content - drop everything after the "Píšeme jinde"
        # box and everything before the main <div>
        data = data.split(
            '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>'
        )[0]
        data = data.split('<div class="st" id="st">')[1]

        # some blogs have an opening comment in the perex, which breaks the
        # bloglist - this closes comments that go over the bloglist
        data = data.replace(
            '<div class="signature">',
            '<!-- --><div class="signature">'
        )

        # parse basic info about all blogs at page
        dom = _dhtmlparser.parseString(data)
        for bcnt, blog in enumerate(dom.findB("div", {"class": "cl"})):
            yield Blogpost.from_html(blog, lazy=lazy)

            # every page has 25 blogposts, but sometimes more of them are
            # found - the extra ones are skipped
            if bcnt >= 24:
                break

        # detect end of pagination at the bottom
        if not _should_continue(dom):
            break

        # `cnt` counts pages processed by this call, so it has to be shifted
        # by `start` to get the real page number which `end` refers to.
        # NOTE(review): assumes that _next_blog_url(start) yields the url of
        # the page `start` first - confirm against its definition.
        if end is not None and (start or 0) + cnt >= end:
            break
def _get(self, url, params=None, as_text=True):
    """
    Download `url` using ``shared.download()`` and session of this object.

    Args:
        url (str): Url on which the GET request will be sent.
        params (dict): GET parameters.
        as_text (bool, default True): Return result as text or binary
            data. Passed to ``shared.download()`` unchanged.

    Returns:
        str/binary data: depending on the `as_text` parameter.
    """
    request = {
        "url": url,
        "params": params,
        "session": self.session,
        "as_text": as_text,
    }

    return shared.download(**request)
def _init_metadata(self, data=None):
    """
    Parse the management section of the concept page into ``self._meta``.

    ``self._meta`` is set to a dict, which maps the stripped content of
    each link found in the section to its ``href``.

    Args:
        data (str, default None): HTML of the concept page. If not set
            (or empty), the page at ``self.link`` is downloaded.

    Raises:
        ValueError: When the management section is not present in the page.
    """
    section_title = '<div class="s_nadpis">Správa zápisku</div>'

    if not data:
        data = download(self.link, session=self._session)

    if section_title not in data:
        raise ValueError(
            "Can't parse metadata! It looks like I am not logged in!"
        )

    # only the part of the page following the section title is parsed
    management_html = data.split(section_title)[1]

    dom = dhtmlparser.parseString(management_html)
    section = first(dom.find("div", {"class": "s_sekce"}))

    self._meta = {}
    for item in section.find("li"):
        link = first(item.find("a"))
        label = link.getContent().strip()

        self._meta[label] = link.params["href"]
def remove_tag(self, tag, throw=False):
    """
    Remove tag from the tags currently assigned to blogpost.

    Args:
        tag (Tag): :class:`Tag` instance. See :class:`possible_tags` for
            list of all possible tags.
        throw (bool): Raise error in case you are trying to remove tag
            that is not assigned to blogpost.

    Raises:
        KeyError: In case, that `tag` is not instance of :class:`Tag`.
        IndexError: In case that you are trying to remove tag which is not
            assigned to blogpost (only when `throw` is set).
        ValueError: In case that :attr:`uid` is not set.

    Returns:
        list: List of :class:`Tag` objects.
    """
    if not isinstance(tag, Tag):
        raise KeyError(
            "Tag have instance of Tag and to be from .tags()"
        )

    # unassigned tag - nothing to do, unless the caller asked for an error
    if tag not in self.tags:
        if not throw:
            return self.tags

        raise IndexError("Can't remove unassigned tag.")

    if not self.uid:
        # this message was copied from the tag assignment and used to say
        # "Can't assign tag", which was misleading in this method
        raise ValueError(
            "Can't remove tag - .uid property not set. Call .pull() or "
            "assign .uid manually."
        )

    tags_xml = download(url_context(
        "/ajax/tags/unassign?rid=%d&tagID=%s" % (self.uid, tag.norm)
    ))

    # the response is parsed and stored as the new set of tags
    self.tags = self.__class__._parse_tags(tags_xml)

    return self.tags
def iter_blogposts(start=0, end=None, lazy=True):
    """
    Iterate over blogs. Based on the bloglist.

    Args:
        start (int, default 0): Start at this page.
        end (int, default None): End at this page (the page itself is still
            processed). The value is a page number, same as `start`.
        lazy (bool, default True): Initialize :class:`.Blogpost` objects
            only with information from listings. Don't download full text
            and comments.

    Yields:
        obj: :class:`.Blogpost` objects.
    """
    for cnt, url in enumerate(_next_blog_url(start)):
        data = _shared.download(url)

        # get just the content - drop everything after the "Píšeme jinde"
        # box and everything before the main <div>
        data = data.split(
            '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>')[0]
        data = data.split('<div class="st" id="st">')[1]

        # some blogs have an opening comment in the perex, which breaks the
        # bloglist - this closes comments that go over the bloglist
        data = data.replace('<div class="signature">',
                            '<!-- --><div class="signature">')

        # parse basic info about all blogs at page
        dom = _dhtmlparser.parseString(data)
        for bcnt, blog in enumerate(dom.findB("div", {"class": "cl"})):
            yield Blogpost.from_html(blog, lazy=lazy)

            # every page has 25 blogposts, but sometimes more of them are
            # found - the extra ones are skipped
            if bcnt >= 24:
                break

        # detect end of pagination at the bottom
        if not _should_continue(dom):
            break

        # `cnt` counts pages processed by this call, so it has to be shifted
        # by `start` to get the real page number which `end` refers to.
        # NOTE(review): assumes that _next_blog_url(start) yields the url of
        # the page `start` first - confirm against its definition.
        if end is not None and (start or 0) + cnt >= end:
            break
def list_pics(self):
    """
    List pictures attached to this concept.

    The attachment management page is downloaded and ``href`` of each link
    found in its form is returned.

    Raises:
        AssertionError: When the form with pictures can't be found.

    Return:
        list: List of URLs to pictures used in this concept.
    """
    # init meta
    if not self._meta:
        self._init_metadata()

    data = download(url_context(self._meta["Správa příloh"]),
                    session=self._session)
    dom = dhtmlparser.parseString(data)

    forms = dom.find("form", {"name": "form"})

    # Raised explicitly (instead of using the `assert` statement), so the
    # check is not stripped when python runs with -O. The exception type is
    # kept for backward compatibility.
    if not forms:
        raise AssertionError("Can't find pic form!")

    # links without href are skipped
    return [
        a.params["href"]
        for a in first(forms).find("a")
        if "href" in a.params
    ]
def remove_tag(self, tag, throw=False):
    """
    Remove tag from the tags currently assigned to blogpost.

    Args:
        tag (Tag): :class:`Tag` instance. See :class:`possible_tags` for
            list of all possible tags.
        throw (bool): Raise error in case you are trying to remove tag
            that is not assigned to blogpost.

    Raises:
        KeyError: In case, that `tag` is not instance of :class:`Tag`.
        IndexError: In case that you are trying to remove tag which is not
            assigned to blogpost (only when `throw` is set).
        ValueError: In case that :attr:`uid` is not set.

    Returns:
        list: List of :class:`Tag` objects.
    """
    if not isinstance(tag, Tag):
        raise KeyError("Tag have instance of Tag and to be from .tags()")

    # unassigned tag - nothing to do, unless the caller asked for an error
    if tag not in self.tags:
        if not throw:
            return self.tags

        raise IndexError("Can't remove unassigned tag.")

    if not self.uid:
        # this message was copied from the tag assignment and used to say
        # "Can't assign tag", which was misleading in this method
        raise ValueError(
            "Can't remove tag - .uid property not set. Call .pull() or "
            "assign .uid manually.")

    tags_xml = download(
        url_context("/ajax/tags/unassign?rid=%d&tagID=%s" %
                    (self.uid, tag.norm)))

    # the response is parsed and stored as the new set of tags
    self.tags = self.__class__._parse_tags(tags_xml)

    return self.tags
def pull(self):
    """
    Download page with blogpost. Parse text, comments and everything else.

    Until this is called, following attributes are not known/parsed:

        - :attr:`text`
        - :attr:`tags`
        - :attr:`has_tux`
        - :attr:`comments`
        - :attr:`last_modified_ts`

    Raises:
        ValueError: When the page tools marker is not found in the page
            exactly once (the page can't be split to two parts then).
    """
    page_html = download(url=self.url)

    # The blogpost and the comments are parsed separately, because there
    # are blogposts with unclosed elements, like this one:
    # https://www.abclinuxu.cz/blog/EmentuX/2005/10/all-in-one
    blog_data, comments_data = page_html.split('<p class="page_tools">')

    self._dom = dhtmlparser.parseString(blog_data)
    self._content_tag = None
    dhtmlparser.makeDoubleLinked(self._dom)

    # parsers are called in the same fixed order
    self._parse_uid()
    self._parse_title()
    self._parse_text()
    self._parse_rating()
    self._parse_meta()

    self._tags = self._get_tags()

    # comments of the blogposts from the banlist are skipped, because their
    # HTML is basically unparsable
    if self.relative_url not in COMMENT_BANLIST:
        self.comments = Comment.comments_from_html(comments_data)
        self.comments_n = len(self.comments)

    # memory cleanup - the parsed DOM is not kept
    self._dom = None
    self._content_tag = None
def list_pics(self):
    """
    List pictures attached to this concept.

    The attachment management page is downloaded and ``href`` of each link
    found in its form is returned.

    Raises:
        AssertionError: When the form with pictures can't be found.

    Return:
        list: List of URLs to pictures used in this concept.
    """
    # init meta
    if not self._meta:
        self._init_metadata()

    data = download(
        url_context(self._meta["Správa příloh"]),
        session=self._session
    )
    dom = dhtmlparser.parseString(data)

    forms = dom.find("form", {"name": "form"})

    # Raised explicitly (instead of using the `assert` statement), so the
    # check is not stripped when python runs with -O. The exception type is
    # kept for backward compatibility.
    if not forms:
        raise AssertionError("Can't find pic form!")

    # links without href are skipped
    return [
        a.params["href"]
        for a in first(forms).find("a")
        if "href" in a.params
    ]
def get_content(self):
    """
    Get content of this Concept.

    Raises:
        ValueError: When the ``<p class="meta-vypis">`` can't be found.

    Returns:
        str: full HTML UTF-8 encoded text of the concept.
    """
    page_html = download(self.link, session=self._session)

    # the page is already downloaded, so it is reused for the metadata
    if not self._meta:
        self._init_metadata(page_html)

    # drop everything after the last `<!-- -->`
    page_html = first(page_html.rsplit('<!-- -->', 1))

    # the text of the concept begins after the meta-vypis paragraph
    dom = dhtmlparser.parseString(page_html)
    markers = dom.find("p", {"class": "meta-vypis"})

    if not markers:
        raise ValueError("Can't find meta-vypis <p>!")

    marker_html = str(first(markers))
    content = page_html.split(marker_html)[1]

    return content.strip()
def _get_tags(self):
    """
    Download and parse tags assigned to the blogpost with ``self.uid``.

    Returns:
        Result of ``_parse_tags()`` called on the downloaded data.
    """
    assigned_url = url_context("/ajax/tags/assigned?rid=%d" % self.uid)
    assigned_xml = download(assigned_url)

    return self.__class__._parse_tags(assigned_xml)
def get(url, name, match):
    """
    Download `url` and extract the result into its own directory.

    Args:
        url: Address passed to ``download()`` together with
            ``DOWNLOAD_DIR``.
        name: Name of the subdirectory of ``EXTRACTED_DIR``, which is
            passed to ``unzip()`` as the target.
        match: Passed to ``unzip()`` unchanged.
            NOTE(review): presumably selects which members are extracted -
            confirm against ``unzip()``.
    """
    print('Downloading {}'.format(url))
    # named `downloaded`, so the `file` builtin is not shadowed
    downloaded = download(url, DOWNLOAD_DIR)

    print('Extracting {}'.format(name))
    unzip(downloaded, os.path.join(EXTRACTED_DIR, name), match)