def gettypehash(url):
    """Scrape the category list from the main page and persist it as JSON.

    Each saved record is ``{'type': <title>, 'url': <href>}``.
    """
    markup = utils.gethtml(url)
    doc = BeautifulSoup(markup, "lxml")
    # parse main page info: every category anchor under the list container
    anchors = doc.select('#list_categories_categories_list_items > a')
    records = []
    for anchor in anchors:
        records.append({'type': anchor['title'], 'url': anchor['href']})
    # save json data
    utils.savejson(alljsonpath, records)
def getpagecontent(ct):
    """Fetch and filter the detail page of every entry that carries a url.

    ``ct`` items are indexable records: index 0 is an identifier passed to
    ``contentfilter``, index 7 is the page url (skipped when falsy).  Only
    filter results of exactly 6 fields are kept.
    """
    collected = []
    for entry in ct:
        page_url = entry[7]
        if not page_url:
            continue
        raw = utils.gethtml(page_url)
        parsed = contentfilter(page_url, entry[0], raw)
        if parsed and len(parsed) == 6:
            collected.append(parsed)
    return collected
def test_category(self):
    """The btyunsou search page must expose exactly three sort categories."""
    keyword = "big bang"
    query = "http://www.btyunsou.co/search?kw=" + keyword
    rsp = gethtml(url=query,
                  outhtml=os.path.join(self.outpath, keyword + '.html'))
    doc = pq(rsp)
    # category id is the token after '_' in each sort link's href
    found = []
    for anchor in doc('div.sort li a').items():
        found.append(anchor.attr('href').split('_')[1])
    # btyunsou is quite simple, three categories
    self.assertEqual(Counter(found), Counter(['ctime', 'length', 'click']))
def getvalidpage(kw, category, index):
    """Download one search-result page and return it as a pyquery document.

    Returns None (and logs an error) when the page holds no result entries.
    """
    kwpage = "{0}_{1}_{2}.html".format(kw, category, index)
    qurl = SCHEMA + DOMAIN + '/search/' + kwpage
    logger.info('qurl is {}'.format(qurl))
    rsp = gethtml(url=qurl, outhtml=os.path.join(STOREDPATH, kwpage))
    html = pq(rsp)
    # an empty 'media-body' means the search produced nothing
    if html('div.media-body').html() is None:
        logger.error('No result for {}'.format(kw))
        return None
    return html
def test_entryurl(self):
    """The landing page must expose a GET form submitting to /search."""
    rsp = gethtml(url=self.url,
                  outhtml=os.path.join(self.outpath, 'btmain.html'))
    doc = pq(rsp)
    form = doc('form')
    # should have the form element
    self.assertIsNotNone(form.text())
    # method should be 'get'
    self.assertEqual(form.attr('method'), 'get')
    # action should be '/search'
    self.assertEqual(form.attr('action'), '/search')
def getentrypage(kw):
    """POST the keyword search form and return the result as a pyquery doc.

    Returns None (and logs an error) when the result list is empty.
    """
    payload = {"keyword": kw}
    kwpage = "{0}_Relevance_1.html".format(kw)
    rsp = gethtml(url=BTURL,
                  outhtml=os.path.join(STOREDPATH, kwpage),
                  method='POST',
                  data=payload,
                  proxies=proxies)
    doc = pq(rsp)
    # 'list-con' is the result container; missing means no hits
    if doc('dl.list-con').html() is None:
        logger.error('No result for {}'.format(kw))
        return None
    return doc
def fetchresult(url):
    """Fetch one detail page, log its magnet link (if present), then back off."""
    target = BTURL + url
    stored = os.path.join(STOREDPATH, url.split('/')[2])
    rsp = gethtml(url=target, outhtml=stored, proxies=proxies)
    doc = pq(rsp)
    dlink = doc('dd.magnet a').text()
    # FIXME: even when the url is valid, the magnet uri may be none... WTF~!
    if dlink:
        logger.info(dlink)
    else:
        logger.info("no result for {}.".format(target))
    # random polite delay between consecutive fetches
    time.sleep(randint(1, 5))
def getleavelsinfo():
    """Walk every per-category JSON file in ./infolist, resolve each item's
    video url, and save the enriched data under ./leaveinfo.

    Files whose name contains 'all' (the aggregate) and non-JSON files are
    skipped.  For each item with a non-empty 'url', the page is fetched,
    ``getleavesinfo`` extracts the video url, and (when found) it is stored
    back on the item as 'vurl' before the whole list is re-saved.
    """
    rootpath = './infolist'
    lpath = './leaveinfo'
    # iterate names directly instead of range(len(...)) indexing
    for name in os.listdir(rootpath):
        p = os.path.join(rootpath, name)
        # guard clause: skip the 'all' aggregate and anything not .json
        if 'all' in p or os.path.splitext(p)[1] != '.json':
            continue
        data = utils.getjsondata(p)
        for item in data:
            if item['url']:
                levlestr = utils.gethtml(item['url'])
                vurl = getleavesinfo(levlestr)
                print(item['title'], vurl)
                if vurl:
                    item['vurl'] = vurl
        utils.savejson(os.path.join(lpath, name), data)
        print(os.path.join(lpath, name), ' done.')
def savesubinfo():
    """Crawl every category's paginated sub pages and save them per type.

    For each category record in the master JSON, pages are fetched by
    substituting an increasing page number into the 'pagenum' placeholder
    of its page url until an empty page is returned; the accumulated
    entries are written to ./infolist/<type>.json.  A category without a
    page url still gets an (empty) JSON file, matching the original flow.
    """
    data = utils.getjsondata(alljsonpath)
    for record in data:
        purl = record['pageurl']
        vtype = record['type']
        vinfo = []
        # initial page num to 1
        pagenum = 1
        if purl:
            while True:
                tpurl = purl.replace('pagenum', str(pagenum))
                slists = getsubinfo(utils.gethtml(tpurl))
                if len(slists) == 0:
                    break
                vinfo.extend(slists)
                pagenum += 1
        utils.savejson('./infolist/{vtype}.json'.format(vtype=vtype), vinfo)
        print('共{count}条'.format(count=len(vinfo)))
def go(self):
    # Menu "OK" handler: dispatches on self.theFunc, which names the current
    # navigation level ("main" -> "genres" -> "movie" -> "host").  Python 2
    # code (print statements).  NOTE(review): indentation reconstructed from
    # a whitespace-collapsed source — structure at branch boundaries should
    # be confirmed against the original file.
    returnTitle = self["myMenu"].l.getCurrentSelection()[0]
    returnValue = self["myMenu"].l.getCurrentSelection()[1]
    returnIndex = self["myMenu"].getSelectedIndex()
    # Record the current screen in the back-history (except at "host" level).
    if not self.theFunc == "host":
        try:
            # overwrite the slot at the current history depth...
            self.historyList[int(self.historyInt)] = [self.theFunc, self.osdList, returnIndex]
        except:
            # ...or append when the slot does not exist yet
            self.historyList.append([self.theFunc, self.osdList, returnIndex])
        self.historyInt = self.historyInt + 1
    if self.theFunc == "main":
        print ">>>>>>>>>>>>main"
        print self.theFunc
        if not returnValue == "about":
            # selected a provider object: show its genre list
            self.mainobj = returnValue
            self["myMenu"].setList(returnValue.osdList)
            self["myText"].setText(self.mainobj.description)
            self.theFunc = "genres"
        else:
            self.askForWord(self.about_text)
    elif self.theFunc == "genres":
        if not returnValue == "about":
            print ">>>>>>>>>>>>genres"
            print self.theFunc
            # returnValue is the genre's first page url
            url = returnValue
            html = utils.gethtml(url)
            first_page = [returnValue]
            print url
            # collect the remaining pagination urls from the provider
            pages = self.mainobj.getPages(html)
            if pages == None:
                return
            pages = first_page + pages
            videos = []
            if pages:
                for page in pages:
                    url = page
                    print "URL >>>>>>>>>: "+url
                    html = utils.gethtml(url)
                    vids_tmp = self.mainobj.getVideos(html)
                    if vids_tmp:
                        videos = videos + vids_tmp
            if not videos:
                return
            # each video record: (title, url, extra) triple for the menu
            self.osdList = [(x[0],x[1],x[2]) for x in videos]
            if self.mainobj.to_sort:
                self.osdList.sort()
            self.lastVideosList = self.osdList
            self["myMenu"].setList(self.osdList)
            num_videos = len(videos)
            self["myText"].setText(self.mainobj.description + "\n\nSono presenti "+str(num_videos)+" film nella categoria "+returnTitle)
            self.theFunc = "movie"
    elif self.theFunc == "movie":
        print self.theFunc
        url = returnValue
        print url
        html = utils.gethtml(url)
        # resolve the movie page into a list of (host_url, host_name) links
        vklink = self.mainobj.getMovie(html, returnTitle)
        if vklink == None:
            return
        self["myText"].setText("\nTitolo Film: "+returnTitle+"\n"+"URL: "+url)
        self.osdList = []
        if vklink:
            tmpindex = 0
            for link in vklink:
                # label is "<host> / <index>", value is the host url
                self.osdList.append((_(link[1] + " / " + str(tmpindex)), link[0]))
                tmpindex = tmpindex + 1
            self["myMenu"].setList(self.osdList)
            self.theFunc = "host"
        else:
            print "#### NO VIDEO LINKS FOUND"
            print url
            text = str(self["myText"].getText()) + "\n\nSpiacente, nessun servizio di streaming trovato!"
            self["myText"].setText(text)
            print self.historyInt
            # stay at the movie level so the user can pick another title
            self.theFunc = "movie"
    elif self.theFunc == "show":
        pass
    elif self.theFunc == "episode":
        pass
    elif self.theFunc == "host":
        print ">>>>>>>>>>>>host"
        print self.theFunc
        print returnValue
        # resolve the host page into a directly playable stream url
        returnUrl = returnValue
        returnUrl = utils.getResolverURL(returnUrl)
        if returnUrl:
            # 4097 = GStreamer service type in Enigma2; buffer 10 MiB
            fileRef = eServiceReference(4097,0,returnUrl)
            fileRef.setData(2,10240*1024)
            fileRef.setName(returnTitle)
            self.session.open(MoviePlayer, fileRef)
        else:
            return
        # NOTE(review): trailing lines placed inside the host branch —
        # confirm original indentation.
        self["myMenu"].moveToIndex(0)
        print "HOST"
        print self.theFunc