# SeedInfo, HttpUtils, EmailSender and Config are assumed to be imported from
# elsewhere in this repo; they are referenced but not defined in this section.

def parse(self, soup_obj):
    assert soup_obj is not None
    tr_list = soup_obj.select("table.torrents tr")
    ids = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue
        td_list = tr.select("td.rowfollow")
        if len(td_list) < 9:
            # skip embedded contents
            continue
        url = tr.select("table.torrentname tr td:nth-of-type(1) a")[0]["href"]
        seed_id = self.parse_id(url)
        ids.append(seed_id)
    return ids
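# parse_id is used throughout but not defined in this section. On NexusPHP-style
# trackers such as M-Team the detail URL normally carries an "id" query parameter
# (e.g. "details.php?id=123456&hit=1"), so a minimal sketch could look like the
# method below; this is an assumption, not necessarily the repo's implementation.
import re

def parse_id(self, url):
    # Pull the numeric id out of the "id=..." query parameter; empty string if absent.
    match = re.search(r"id=(\d+)", url)
    return match.group(1) if match else ""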
def parse_page(self, soup_obj):
    items = soup_obj.select("item")
    assert len(items) != 0
    seeds = []
    for item in items:
        try:
            # the RSS title is assumed to look like "Name [2.3 GB]";
            # split off the bracketed size
            info = HttpUtils.get_content(item, "title").split("[")
            seed = SeedInfo()
            seed.title = info[0].strip()
            seed.size = HttpUtils.pretty_format(info[1].split("]")[0], "MB")
            seed.url = HttpUtils.get_attr(item, "enclosure", "url")
            seed.id = self.parse_id(seed.url)
            # Cache().set(seed.id, str(seed))
            seeds.append(seed)
        except Exception as e:
            print(e)
    return seeds
def parse_page(self, soup_obj):
    items = soup_obj.select("item")
    assert len(items) != 0
    seeds = []
    for item in items:
        try:
            info = HttpUtils.get_content(item, "title").split("[")
            seed = SeedInfo()
            seed.title = info[0].strip()
            # the size is the last two space-separated tokens, e.g. "2.3" + "GB"
            seed.size = HttpUtils.pretty_format(
                info[1].split(" ")[-2] + info[1].split(" ")[-1], "MB")
            # seed.url = HttpUtils.get_content(item, "link")
            seed.url = item.contents[4]  # positional access: the link node sits at index 4
            seed.id = self.parse_id(seed.url)
            seeds.append(seed)
        except Exception:
            # skip items whose title does not match the expected format
            continue
    return seeds
def parse(self, soup_obj):
    assert soup_obj is not None
    tr_list = soup_obj.select("table.torrents tr")
    seeds = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue
        seed = SeedInfo()
        td_list = tr.select("td.rowfollow")
        if len(td_list) < 9:
            # skip embedded contents
            continue
        seed.since = HttpUtils.get_content(td_list[2], "span")
        seed.size = float(self.parse_size(td_list[3]))
        seed.upload_num = int(self.clean_tag(td_list[4]))
        seed.download_num = int(self.clean_tag(td_list[5]))
        seed.finish_num = int(self.clean_tag(td_list[6]))
        seed.done = self.clean_tag(td_list[7])
        seed.working = "peer-active" in td_list[7]["class"]
        td_title = tr.select("td.torrenttr tr td")
        seed.sticky = len(td_title[0].select("img[alt=\"Sticky\"]")) > 0
        seed.title = td_title[0].select("a")[0]["title"]
        seed.url = td_title[0].select("a")[0]["href"]
        seed.free = len(td_title[0].select("img[alt=\"Free\"]")) > 0
        seed.hot = len(td_title[0].select("font.hot")) > 0
        # discount: percentage of the download that counts against the ratio
        if len(td_title[0].select("img[alt=\"50%\"]")) > 0:
            seed.discount = 50
        elif len(td_title[0].select("img[alt=\"30%\"]")) > 0:
            seed.discount = 30
        elif seed.free:
            seed.discount = 0
        else:
            seed.discount = 100
        seed.id = self.parse_id(seed.url)
        seeds.append(seed)
    print("Crawl: " + str(len(seeds)))
    if len(seeds) < 10:
        # suspiciously few rows usually means the page layout changed or login failed
        EmailSender.send("Unable to parse page", Config.get("mteam_username"))
    return seeds
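# parse_size and clean_tag are helpers assumed to live elsewhere in the repo.
# Rough sketches under those assumptions (the real implementations may differ):
def parse_size(self, td):
    # Cell text is assumed to look like "2.3GB" or "512MB"; normalize to MB.
    text = td.get_text().strip().replace(",", "")
    for unit, factor in (("TB", 1024 * 1024), ("GB", 1024), ("MB", 1), ("KB", 1 / 1024)):
        if unit in text:
            return float(text.replace(unit, "").strip()) * factor
    return float(text)

def clean_tag(self, td):
    # Strip markup and thousands separators so int() can parse counts like "1,024".
    return td.get_text().strip().replace(",", "")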
def parse_page(self, soup_obj):
    tr_list = soup_obj.select("table.torrents tr")
    seeds = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue
        seed = SeedInfo()
        td_list = tr.select("td.rowfollow")
        if len(td_list) < 9:
            # skip embedded contents
            continue
        seed.sticky = len(td_list[1].select("table td img[alt=\"Sticky\"]")) > 0
        seed.title = td_list[1].select("table td a")[0]["title"]
        seed.url = td_list[1].select("table td a")[0]["href"]
        seed.free = len(td_list[1].select("table font.free")) > 0
        seed.hot = len(td_list[1].select("table font.hot")) > 0
        seed.since = HttpUtils.get_content(td_list[3], "span")
        seed.size = float(self.parse_size(td_list[4]))
        seed.upload_num = int(self.clean_tag(td_list[5]))
        seed.download_num = int(self.clean_tag(td_list[6]))
        seed.finish_num = int(self.clean_tag(td_list[7]))
        seed.id = self.parse_id(seed.url)
        # parse discount
        if len(td_list[1].select("table td font.halfdown")) > 0:
            seed.discount = 50
        elif len(td_list[1].select("table td font.d30down")) > 0:
            seed.discount = 30
        else:
            seed.discount = 100
        seeds.append(seed)
    return seeds
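# A minimal driver sketch showing how one of these parsers might be wired up.
# The fetch helper and its parameters are illustrative assumptions, not the repo's API.
import requests
from bs4 import BeautifulSoup

def crawl_page(parser, url, cookies=None):
    # Fetch the torrent list and hand the soup to the site-specific parser.
    response = requests.get(url, cookies=cookies, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    return parser.parse_page(soup)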