def get_page(self, id, num):
    """Fetch a single page identified by *id* and page number *num*.

    The address is ``HTTP_MAIN_PAGE`` followed by a query string in which
    ``c`` is *id* and ``page`` is *num*.  The address is reported through
    ``print_debug_text`` and then handed to ``pq``; whatever ``pq`` returns
    is returned unchanged.
    """
    query = "?c={0}&page={1}".format(id, num)
    address = HTTP_MAIN_PAGE + query
    print_debug_text(DEBUG_PHRASES["get_page"] + address)
    return pq(address)
def parse_main_topics(self):
    """Read the main topics from the main page and store them.

    Loads ``HTTP_MAIN_PAGE`` through ``pq`` and walks the children of the
    element selected by ``SELECTS_PATH["main"]``.  Every child is echoed via
    ``print_debug_text``; children whose ``value`` attribute consists only of
    digits are collected into a ``{value: text}`` mapping.  The mapping is
    passed to ``self.serialization`` under ``SAVE_NAMES["main_topics"]`` and
    ``print_warning_text`` is called afterwards.  Returns ``None``.
    """
    document = pq(HTTP_MAIN_PAGE)
    options = document(SELECTS_PATH["main"]).children()
    print_debug_text("\nMain topics:")

    topics = {}
    for option in options:
        value = option.attrib["value"]
        title = option.text
        print_debug_text("\t" + value + " " + title)
        if not value.isdigit():
            # Entries with a non-numeric value are logged but not stored.
            continue
        topics[value] = title

    # The loop has no ``break``, so saving always happens once it is done.
    self.serialization(topics, SAVE_NAMES["main_topics"])
    print_warning_text("main topics has been update")
def get_posts(self, page):
    """Collect the post cards present on *page*.

    *page* is called with the selector ``div.wants-content > div`` and the
    children of the result are inspected.  For every child whose ``class``
    attribute starts with ``card want-card`` the children collection is
    queried for ``div.<class>`` (spaces in the class replaced by dots) and
    the query result is added to the output.

    Returns a list of those query results; the list is empty when no child
    matches.
    """
    print_debug_text(DEBUG_PHRASES["get_posts"] + page.base_url)
    candidates = page("div.wants-content > div").children()
    return [
        candidates("div." + node.attrib["class"].replace(" ", "."))
        for node in candidates
        if node.attrib["class"].startswith("card want-card")
    ]
def parse_sub_topics(self) -> None:
    """Read the sub topics from the main page and store them.

    Loads ``HTTP_MAIN_PAGE`` through ``pq`` and iterates over the children of
    the element selected by ``SELECTS_PATH["sub"]``.  For each such child
    (``topic``) and each of its own children (``sub``), a ``{value: text}``
    mapping is built from every element of ``sub`` except the first and
    stored under the topic's ``data-category-id`` attribute.  The resulting
    mapping is passed to ``self.serialization`` under
    ``SAVE_NAMES["sub_topics"]`` and ``print_warning_text`` is called.
    """
    p = pq(HTTP_MAIN_PAGE)
    sub_topics = p(SELECTS_PATH["sub"]).children()
    print_debug_text("\nSub topics:")
    d = {}
    for topic in sub_topics:
        # NOTE(review): iterating ``topic`` presumably yields its child
        # elements (lxml-style) -- confirm against what ``pq`` returns.
        for sub in topic:
            print_debug_text(topic.attrib["data-category-id"])
            tmp_d = {}
            # Skip the first entry of ``sub``; only the rest are recorded.
            for s in sub[1:]:
                print_debug_text("\t" + s.text)
                tmp_d[s.attrib["value"]] = s.text
            else:
                # No ``break`` above, so this runs after every inner loop.
                # A later ``sub`` of the same topic overwrites an earlier one.
                d[topic.attrib["data-category-id"]] = tmp_d
    else:
        # No ``break`` in the outer loop either: save once all topics are done.
        self.serialization(d, SAVE_NAMES["sub_topics"])
        print_warning_text("sub topics has been update")
def deserialization(self, name):
    """Load and return the JSON content of the file *name* in ``DATA_DIR``.

    Args:
        name: File name, joined onto ``DATA_DIR`` to form the path.

    Returns:
        The Python object decoded by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join(DATA_DIR, name)
    with open(path) as f:
        data = json.load(f)
    # Emit the debug line before returning; placed after a ``return`` it
    # would never run.  This mirrors the logging done by ``serialization``.
    print_debug_text("\n" + DEBUG_PHRASES["deserl"] + path)
    return data
def serialization(self, data, name):
    """Write *data* as JSON into the file *name* inside ``DATA_DIR``.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    Once the data has been dumped, the target path is reported through
    ``print_debug_text``.  Returns ``None``.
    """
    target = os.path.join(DATA_DIR, name)
    with open(target, "w") as out:
        json.dump(data, out)
    print_debug_text("\n" + DEBUG_PHRASES["serl"] + target)
# Names exported when this package is star-imported.
__all__ = [
    "browser",
    "core",
    "parser",
    "settings",
    "text_color",
]

from src.settings import DEBUG_PHRASES
from src.text_color import print_debug_text

# Report at import time which module is being initialised.
print_debug_text(DEBUG_PHRASES["__init__"] + __name__)