def _parse_url(head_tag):
    """
    Compose link to the comment from its head tag.

    The numeric blog ID is picked from the "Odpovědět" (reply) link, the
    anchor is the ``id`` parameter of the `head_tag`.

    Args:
        head_tag (obj): Element with the head of the comment.

    Returns:
        Result of ``url_context()`` for ``/blog/show/<blog_id>#<id>``, \
        or None if there is no reply link in the `head_tag`.
    """
    anchor = head_tag.params["id"]

    # the reply link looks like
    # <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxq
    # e2j?action=add&dizId=210591&threadId=9">Odpovědět</a>
    reply_links = head_tag.find(
        "a",
        fn=lambda tag: tag.getContent() == "Odpovědět"
    )

    try:
        href = first(reply_links).params["href"]
    except StopIteration:
        return None

    # /blog/EditDiscussion/400959;jsessii... -> /blog/EditDiscussion/400959
    # /blog/EditDiscussion/400959?action=a.. -> /blog/EditDiscussion/400959
    path = href.split(";")[0].split("?")[0]

    # /blog/EditDiscussion/400959 -> 400959
    numeric_parts = (part for part in path.split("/") if part.isdigit())
    blog_id = first(numeric_parts)

    return url_context("/blog/show/%s#%s" % (blog_id, anchor))
def _parse_meta(self): content = self._parse_content_tag() meta_vypis_tags = content.find("p", {"class": "meta-vypis"}) if not meta_vypis_tags: return meta_vypis_tag = first(meta_vypis_tags) has_tux_tags = meta_vypis_tag.find("img", {"class": "blog_digest"}) if has_tux_tags: self.has_tux = True # get clean string - another thing which is not semantic at all lines = dhtmlparser.removeTags(meta_vypis_tag) self.created_ts = parse_timestamp(lines) # rest will be picked one by one lines = lines.strip().splitlines() # parse last modification time modified_ts_line = [x for x in lines if "poslední úprava:" in x] if modified_ts_line: date_string = first(modified_ts_line).split(": ")[-1] self.last_modified_ts = parse_timestamp(date_string) # parse number of reads reads_line = [x for x in lines if "Přečteno:" in x] if reads_line: reads = first(reads_line).split(":")[-1].split("&")[0] self.readed = int(reads)
def _parse_text(self):
    """
    Cut the text of the blogpost from the content tag into `self.text`.

    Works on a deep copy of the content tag. Everything before the heading
    (first ``<h2>``, or ``<h1>`` if there is no ``<h2>``), the rating
    ``<div>`` with everything after it, and the ``meta-vypis`` paragraph
    are removed; what remains is stored as text.
    """
    body = copy.deepcopy(self._parse_content_tag())

    # the page is not structured as a tree, so boundaries have to be
    # located by hand
    heading = first(body.find("h2") + body.find("h1"))
    rating = first(body.find("div", {"class": "rating"}))

    # drop siblings in front of the heading
    heading_parent = heading.parent
    while heading_parent.childs[0] != heading:
        heading_parent.childs.pop(0)

    # drop siblings behind the rating ...
    rating_parent = rating.parent
    while rating_parent.childs[-1] != rating:
        rating_parent.childs.pop()

    # ... and the rating itself
    rating_parent.childs.pop()

    meta_tags = body.find("p", {"class": "meta-vypis"})
    if meta_tags:
        body.removeChild(meta_tags, end_tag_too=True)

    self.text = body.getContent()
def edit(self, text, title=None, date_of_pub=None):
    """
    Edit concept.

    Args:
        text (str): New text of the concept.
        title (str, default None): New title of the concept. If not set,
              old title is used.
        date_of_pub (str/int, default None): Date string in abclinuxu
                    format or timestamp determining when the concept should
                    be automatically published. If not set, the value
                    already present in the edit form is sent back.

    Note:
        `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.

    Raises:
        AssertionError: If the edit form can't be found in the page.
    """
    if not self._meta:
        self._init_metadata()

    data = download(
        url_context(self._meta["Uprav zápis"]),
        session=self._session
    )
    dom = dhtmlparser.parseString(data)

    form = dom.find("form", {"name": "form"})

    # explicit raise instead of the `assert` statement, so the check is not
    # stripped when python runs with -O; exception type stays the same
    if not form:
        raise AssertionError("Can't find edit form!")

    form = first(form)
    form_action = form.params["action"]

    # reuse old title from the form
    if title is None:
        title = first(form.find("input", {"name": "title"}))
        title = title.params["value"]

    if date_of_pub is None:
        # reuse date from the form
        date = first(form.find("input", {"name": "publish"}))
        date = date.params["value"]
    elif isinstance(date_of_pub, basestring):
        date = date_of_pub
    else:
        date = ts_to_concept_date(date_of_pub)

    data = download(
        url=url_context(form_action),
        method="POST",
        data={
            "cid": 0,
            "publish": date,
            "content": text,
            "title": title,
            "delay": "Ulož",
            "action": "edit2"
        },
        session=self._session
    )

    # error reporting is delegated to the helpers
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_page(data)
def edit(self, text, title=None, date_of_pub=None):
    """
    Edit concept.

    Args:
        text (str): New text of the concept.
        title (str, default None): New title of the concept. If not set,
              old title is used.
        date_of_pub (str/int, default None): Date string in abclinuxu
                    format or timestamp determining when the concept should
                    be automatically published. If not set, the value
                    found in the edit form is used.

    Note:
        `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.

    Raises:
        AssertionError: If the edit form can't be found in the page.
    """
    if not self._meta:
        self._init_metadata()

    edit_page = download(
        url_context(self._meta["Uprav zápis"]),
        session=self._session
    )
    forms = dhtmlparser.parseString(edit_page).find("form", {"name": "form"})

    # raised explicitly - `assert` statements vanish under `python -O`,
    # which would turn missing form into confusing StopIteration later
    if not forms:
        raise AssertionError("Can't find edit form!")

    form = first(forms)
    form_action = form.params["action"]

    def form_value(name):
        # value of the <input name="..."> currently present in the form
        return first(form.find("input", {"name": name})).params["value"]

    if title is None:
        title = form_value("title")

    if date_of_pub is None:
        date = form_value("publish")
    elif isinstance(date_of_pub, basestring):
        date = date_of_pub
    else:
        date = ts_to_concept_date(date_of_pub)

    response = download(
        url=url_context(form_action),
        method="POST",
        data={
            "cid": 0,
            "publish": date,
            "content": text,
            "title": title,
            "delay": "Ulož",
            "action": "edit2"
        },
        session=self._session
    )

    # error reporting is delegated to the helpers
    check_error_div(response, '<div class="error" id="contentError">')
    check_error_page(response)
def add_pic(self, opened_file):
    """
    Upload picture to the Concept.

    Args:
        opened_file (file): Opened file object with the picture.
    """
    # metadata are required for the link to the upload form
    if not self._meta:
        self._init_metadata()

    form_page = download(
        url_context(self._meta["Přidej obrázek"]),
        session=self._session
    )

    # the upload form is the multipart one
    upload_form = first(
        dhtmlparser.parseString(form_page).find(
            "form",
            {"enctype": "multipart/form-data"}
        )
    )
    upload_url = upload_form.params["action"]

    # send the picture
    response = self._session.post(
        url_context(upload_url),
        data={
            "action": "addScreenshot2",
            "finish": "Nahrát"
        },
        files={"screenshot": opened_file}
    )

    check_error_div(
        response.text.encode("utf-8"),
        '<div class="error" id="screenshotError">'
    )
def cut_dom_to_area_of_interest(html):
    """
    Return the part of the page which follows the ``ds_toolbox`` ``<div>``.

    Args:
        html (str/obj): HTML string or ``dhtmlparser.HTMLElement``. The
             parameter itself is never modified - elements are deep-copied.

    Returns:
        obj: Parent element of the ``ds_toolbox``, with the toolbox and \
             everything in front of it removed from its children.

    Raises:
        ValueError: If the ``ds_toolbox`` can't be found.
    """
    # always work with own tree
    if isinstance(html, dhtmlparser.HTMLElement):
        tree = copy.deepcopy(html)
    else:
        tree = dhtmlparser.parseString(html)

    dhtmlparser.makeDoubleLinked(tree)

    # comments are not stored hierarchically, but in flat-nested lists
    # which follow the toolbox marking end of the article
    toolboxes = tree.find("div", {"class": "ds_toolbox"})
    if not toolboxes:
        raise ValueError("Couldn't locate ds_toolbox!")

    toolbox = first(toolboxes)
    container = toolbox.parent

    # get rid of everything up to the toolbox ...
    while container.childs[0] != toolbox:
        container.childs.pop(0)

    # ... and of the toolbox itself
    container.childs.pop(0)

    return container
def add_pic(self, opened_file):
    """
    Add picture to the Concept.

    Args:
        opened_file (file): Opened file object, which will be uploaded.
    """
    if not self._meta:
        self._init_metadata()

    # page with the upload form
    form_url = url_context(self._meta["Přidej obrázek"])
    dom = dhtmlparser.parseString(download(form_url, session=self._session))

    # address, where the form should be sent
    pic_forms = dom.find("form", {"enctype": "multipart/form-data"})
    target = first(pic_forms).params["action"]

    form_fields = {
        "action": "addScreenshot2",
        "finish": "Nahrát"
    }
    result = self._session.post(
        url_context(target),
        data=form_fields,
        files={"screenshot": opened_file}
    )
    result = result.text.encode("utf-8")

    check_error_div(result, '<div class="error" id="screenshotError">')
def _get_user_id(self): """ Resolve user's ID number for logged user. Returns: str: USER id as string. """ if self._user_id is not None: return self._user_id self.login() dom = dhtmlparser.parseString(self._get(ABCLINUXU_URL)) # resolve user's navigation panel nav_bar = dom.match( ["div", { "class": "hl_vpravo" }], { "tag_name": "a", "fn": lambda x: x.params.get("href", "").startswith("/Profile") }) if not nav_bar: raise ValueError("Can't parse user's navigation bar!") profile_link = first(nav_bar).params["href"] # transform /Profile/24642?action=myPage -> 24642 self._user_id = profile_link.split("?")[0].split("/")[-1] return self._user_id
def _get_user_id(self): """ Resolve user's ID number for logged user. Returns: str: USER id as string. """ if self._user_id is not None: return self._user_id self.login() dom = dhtmlparser.parseString(self._get(ABCLINUXU_URL)) # resolve user's navigation panel nav_bar = dom.match( ["div", {"class": "hl_vpravo"}], { "tag_name": "a", "fn": lambda x: x.params.get("href", "").startswith("/Profile") } ) if not nav_bar: raise ValueError("Can't parse user's navigation bar!") profile_link = first(nav_bar).params["href"] # transform /Profile/24642?action=myPage -> 24642 self._user_id = profile_link.split("?")[0].split("/")[-1] return self._user_id
def _init_metadata(self, data=None):
    """
    Fill `self._meta` with links from the "Správa zápisku" section.

    Args:
        data (str, default None): Content of the concept's page. It is
             downloaded from `self.link` if not set.

    Raises:
        ValueError: If the section is not in the page.
    """
    section_heading = '<div class="s_nadpis">Správa zápisku</div>'

    page = data if data else download(self.link, session=self._session)

    if section_heading not in page:
        raise ValueError(
            "Can't parse metadata! It looks like I am not logged in!"
        )

    # parse only the part which follows the heading
    dom = dhtmlparser.parseString(page.split(section_heading)[1])
    section = first(dom.find("div", {"class": "s_sekce"}))

    # {text of the link: href}
    self._meta = {}
    for item in section.find("li"):
        link = first(item.find("a"))
        self._meta[link.getContent().strip()] = link.params["href"]
def _init_metadata(self, data=None):
    """
    Parse links for management of the concept into `self._meta`.

    `self._meta` maps stripped text of each link found in ``<li>`` items of
    the first ``<div class="s_sekce">`` after the "Správa zápisku" heading
    to its ``href``.

    Args:
        data (str, default None): Page with the concept; downloaded from
             `self.link` when it is not given.

    Raises:
        ValueError: If the heading is missing in the page.
    """
    if not data:
        data = download(self.link, session=self._session)

    heading = '<div class="s_nadpis">Správa zápisku</div>'
    if heading not in data:
        raise ValueError(
            "Can't parse metadata! It looks like I am not logged in!"
        )

    after_heading = data.split(heading)[1]
    sections = dhtmlparser.parseString(after_heading).find(
        "div",
        {"class": "s_sekce"}
    )

    self._meta = {}
    for list_item in first(sections).find("li"):
        anchor = first(list_item.find("a"))
        label = anchor.getContent().strip()
        self._meta[label] = anchor.params["href"]
def _parse_title(self): assert self._dom title_tag = self._dom.find("title") if not title_tag: return self.title = first(title_tag).getContent()
def from_html(html, lazy=True):
    """
    Convert HTML string to :class:`Blogpost` instance.

    Args:
        html (str): Input data. ``dhtmlparser.HTMLElement`` is accepted
             too and used without parsing.
        lazy (bool, default True): Be lazy (don't pull data by yourself
             from the site). Call :meth:`pull` for active download of all
             required informations.

    Returns:
        obj: :class:`Blogpost` instance.
    """
    if not isinstance(html, dhtmlparser.HTMLElement):
        html = dhtmlparser.parseString(html)
        # NOTE(review): element passed in by the caller is used as it is,
        # only freshly parsed tree is double-linked here
        dhtmlparser.makeDoubleLinked(html)

    # legacy blogs have the link directly in the <h2 class="st_nadpis">
    legacy_titles = html.find("h2", {"class": "st_nadpis"})
    if legacy_titles:
        title_tag = first(legacy_titles)
        link = url_context(first(title_tag.find("a")).params["href"])
    else:
        title_tag = first(html.find("h2"))
        canonical = first(html.find("link", {"rel": "canonical"}))
        link = canonical.params["href"]

    title = dhtmlparser.removeTags(title_tag).strip()

    # paragraph with metadata
    meta = html.find("p", {"class": "meta-vypis"})[0]

    blog = Blogpost(url=link, lazy=lazy)
    if not lazy:
        return blog

    # lazy instance is filled with what can be read from the given HTML
    blog.title = title
    blog.intro = Blogpost._parse_intro(html, meta, title_tag)
    blog.rating = Blogpost._parse_rating_from_preview(meta)
    blog.created_ts = parse_timestamp(meta)
    blog.comments_n = Blogpost._parse_comments_n(meta)

    return blog
def _parse_text(body_tag): censored = False text_tag = body_tag.find("div", {"class": "ds_text"}) if not text_tag: censored = True text_tag = body_tag.find("div", {"class": "cenzura"}) if not text_tag: raise ValueError("Can't find comment body!") return first(text_tag).getContent().strip(), censored
def _parse_rating(self): content = self._parse_content_tag() rating_tags = content.find("div", {"class": "rating"}) if not rating_tags: return # <span> with voting info voting_spans = first(rating_tags).find("span") if not voting_spans: return voting_span = first(voting_spans) rating = voting_span.getContent() base = voting_span.params.get("title", "0") self.rating = Rating( rating=int(rating.split()[0]), base=int(base.split()[-1]), )
def _response_to(head_tag): response_to_tag = head_tag.find( "a", fn=lambda x: x.getContent() == "Výše" ) if not response_to_tag: return None # <a href="#2" title="...">Výše</a> -> #2 response_to_link = first(response_to_tag).params["href"] # #2 -> 2 return response_to_link.split("#")[-1]
def _izolate_username(head_tag):
    """
    Pick username of the comment's author from the head tag.

    Args:
        head_tag (obj): Element with the head of the comment.

    Returns:
        tuple: ``(username, registered)``. `registered` is True when the \
               username was taken from the ``/lide/`` link, False when it \
               was parsed from the text of the head.
    """
    profile_links = head_tag.find(
        "a",
        fn=lambda tag: tag.params.get("href", "").startswith("/lide/")
    )

    if profile_links:
        # /lide/manasekp -> manasekp
        href = first(profile_links).params["href"]
        return href.split("/")[2], True  # registered

    # unregistered users have the name only in unstructured HTML like:
    #
    #   10.2. 21:53
    #
    #   Tomáškova máma
    plain_text = dhtmlparser.removeTags(head_tag.getContent())

    # stripped non-blank lines
    rows = [row.strip() for row in plain_text.splitlines() if row.strip()]

    # the username is on the line right after the line with time
    time_row = first(date_izolator(rows))
    candidate = rows[rows.index(time_row) + 1]

    # "Rozbalit" at that place means, that no username was found
    username = "" if candidate == "Rozbalit" else candidate.strip()

    return username, False  # unregistered
def get_content(self):
    """
    Get content of this Concept.

    The page is downloaded from `self.link`; metadata are initialized from
    it, if they weren't already.

    Returns:
        str: full HTML UTF-8 encoded text of the concept.

    Raises:
        ValueError: If the ``meta-vypis`` paragraph can't be found.
    """
    page = download(self.link, session=self._session)

    if not self._meta:
        self._init_metadata(page)

    # throw away everything from the last '<!-- -->'
    page = first(page.rsplit('<!-- -->', 1))

    # text of the concept begins after the meta-vypis paragraph
    meta_tags = dhtmlparser.parseString(page).find(
        "p",
        {"class": "meta-vypis"}
    )
    if not meta_tags:
        raise ValueError("Can't find meta-vypis <p>!")

    separator = str(first(meta_tags))

    return page.split(separator)[1].strip()
def _parse_content_tag(self): assert self._dom if self._content_tag: return self._content_tag content_tags = self._dom.find("div", {"class": "st", "id": "st"}) if not content_tags: raise ValueError("Can't find content - is this really blogpost?") self._content_tag = first(content_tags) if not self._content_tag.isOpeningTag(): self._content_tag = self._content_tag.parent return self._content_tag
def cut_dom_to_area_of_interest(html):
    """
    Return the part of the page which follows the ``ds_toolbox`` ``<div>``.

    Args:
        html (str/obj): HTML string or ``dhtmlparser.HTMLElement``. The
             parameter itself is never modified - elements are deep-copied.

    Returns:
        obj: Parent element of the ``ds_toolbox``, with the toolbox and \
             everything in front of it removed from its children.

    Raises:
        StopIteration: In case of no comments.
        ValueError: In case that there is missing elements from HTML.
    """
    # always work with own tree
    if isinstance(html, dhtmlparser.HTMLElement):
        tree = copy.deepcopy(html)
    else:
        tree = dhtmlparser.parseString(html)

    dhtmlparser.makeDoubleLinked(tree)

    # comments are not stored hierarchically, but in flat-nested lists
    # which follow the toolbox marking end of the article
    toolboxes = tree.find("div", {"class": "ds_toolbox"})

    if not toolboxes:
        def invites_first_comment(tag):
            return (
                "action=addDiz" in tag.params.get("href", "") and
                tag.getContent().strip() == "Vložit první komentář"
            )

        # blogposts without any comments have link for adding of the first
        # one instead of the toolbox
        if tree.find("a", fn=invites_first_comment):
            raise StopIteration("No comments yet.")

        raise ValueError("Couldn't locate ds_toolbox!")

    toolbox = first(toolboxes)
    container = toolbox.parent

    # get rid of everything up to the toolbox ...
    while container.childs[0] != toolbox:
        container.childs.pop(0)

    # ... and of the toolbox itself
    container.childs.pop(0)

    return container
def list_pics(self):
    """
    List pictures attached to this concept.

    Returns:
        list: List of URLs to pictures used in this concept - ``href`` of \
              each ``<a>`` found in the form at the "Správa příloh" page.

    Raises:
        AssertionError: If the form with pictures can't be found.
    """
    # init meta
    if not self._meta:
        self._init_metadata()

    data = download(
        url_context(self._meta["Správa příloh"]),
        session=self._session
    )
    dom = dhtmlparser.parseString(data)

    form = dom.find("form", {"name": "form"})

    # explicit raise instead of the `assert` statement, so the check is not
    # stripped when python runs with -O; exception type stays the same
    if not form:
        raise AssertionError("Can't find pic form!")

    return [
        a.params["href"]
        for a in first(form).find("a")
        if "href" in a.params
    ]
def register_blog(self, blog_name):
    """
    Register blog under `blog_name`.

    Users doesn't have blogs automatically, you have to create them
    manually.

    Args:
        blog_name (str): Name of the new blog.

    Raises:
        UserWarning: If user already have blog registered.
        ValueError: If it is not possible to register blog for user (see
                    exception message for details).
    """
    if self.has_blog:
        raise UserWarning("User already have blog!")

    edit_path = urljoin("/blog/edit/", self._get_user_id())
    registration_url = urljoin(ABCLINUXU_URL, edit_path)

    form_fields = {
        "blogName": blog_name,
        "category1": "",
        "category2": "",
        "category3": "",
        "action": "addBlog2",
    }

    # NOTE(review): certificate verification is switched off for this request
    response = self.session.post(
        registration_url,
        params=form_fields,
        verify=False,
    )

    # errors are reported in <p class="error">
    error_tags = dhtmlparser.parseString(response.text.encode("utf-8")).find(
        "p",
        {"class": "error"}
    )
    if error_tags:
        raise ValueError(first(error_tags).getContent())

    self.blog_url = self._parse_blogname()

    if not self.has_blog:
        raise ValueError("Couldn't register new blog.")
def register_blog(self, blog_name):
    """
    Create blog called `blog_name` for the user.

    Blogs are not created automatically for the users, they have to be
    registered manually.

    Args:
        blog_name (str): Name of the blog.

    Raises:
        UserWarning: If user already have blog registered.
        ValueError: If it is not possible to register blog for user (see
                    exception message for details).
    """
    if self.has_blog:
        raise UserWarning("User already have blog!")

    add_blog_url = urljoin(
        ABCLINUXU_URL,
        urljoin("/blog/edit/", self._get_user_id())
    )

    # NOTE(review): verify=False disables checking of the certificate
    result = self.session.post(
        add_blog_url,
        params={
            "blogName": blog_name,
            "category1": "",
            "category2": "",
            "category3": "",
            "action": "addBlog2",
        },
        verify=False,
    )

    # first <p class="error"> is used as the message of the exception
    page = result.text.encode("utf-8")
    problems = dhtmlparser.parseString(page).find("p", {"class": "error"})
    if problems:
        message = first(problems).getContent()
        raise ValueError(message)

    self.blog_url = self._parse_blogname()

    # make sure, that the blog was really created
    if self.has_blog:
        return

    raise ValueError("Couldn't register new blog.")
def list_pics(self):
    """
    List pictures attached to this concept.

    Returns:
        list: List of URLs to pictures used in this concept - ``href`` of \
              each ``<a>`` found in the form at the "Správa příloh" page.

    Raises:
        AssertionError: If the form with pictures can't be found.
    """
    if not self._meta:
        self._init_metadata()

    attachments_page = download(
        url_context(self._meta["Správa příloh"]),
        session=self._session
    )
    forms = dhtmlparser.parseString(attachments_page).find(
        "form",
        {"name": "form"}
    )

    # raised explicitly - `assert` statements vanish under `python -O`,
    # which would turn missing form into confusing StopIteration
    if not forms:
        raise AssertionError("Can't find pic form!")

    # links without href are skipped
    return [
        link.params["href"]
        for link in first(forms).find("a")
        if "href" in link.params
    ]