Exemples Python de gethtml, exemples Python de utils.gethtml

Exemple #1

0

Afficher le fichier

Fichier : spider.py Projet : schoeu/spid

def gettypehash(url):
    """Download the page at ``url`` and store its category links as JSON.

    Every ``<a>`` that is a direct child of the element with id
    ``list_categories_categories_list_items`` contributes one record holding
    its ``title`` attribute (under ``'type'``) and its ``href`` attribute
    (under ``'url'``).  The records are handed to ``utils.savejson`` together
    with ``alljsonpath``; nothing is returned.
    """
    markup = utils.gethtml(url)
    document = BeautifulSoup(markup, "lxml")

    # one record per category link found on the main page
    categories = []
    for anchor in document.select('#list_categories_categories_list_items > a'):
        categories.append({'type': anchor['title'], 'url': anchor['href']})

    utils.savejson(alljsonpath, categories)

Exemple #2

0

Afficher le fichier

Fichier : capital.py Projet : schoeu/capital-spider

def getpagecontent(ct):
    """Download and filter the page referenced by each row of ``ct``.

    For every row, element 7 is used as the URL and element 0 is passed
    along to ``contentfilter`` with the downloaded content.  Rows whose URL
    is falsy are skipped.  Only filter results that are truthy and have
    exactly six elements are kept.

    Returns the list of kept filter results, in input order.
    """
    kept = []
    for row in ct:
        link = row[7]
        if not link:
            continue
        page = utils.gethtml(link)
        filtered = contentfilter(link, row[0], page)
        if filtered and len(filtered) == 6:
            kept.append(filtered)
    return kept

Exemple #3

0

Afficher le fichier

Fichier : test_btyunso.py Projet : deevarvar/magneturl-crawler

 def test_category(self):
     """Search for a keyword and check the sort categories the page offers."""
     keyword = "big bang"
     search_url = "http://www.btyunsou.co/search?kw=" + keyword
     raw = gethtml(
         url=search_url,
         outhtml=os.path.join(self.outpath, keyword + '.html'),
     )
     document = pq(raw)

     # each sort link's href is split on '_' and the second piece is kept
     categories = []
     for link in document('div.sort li a').items():
         categories.append(link.attr('href').split('_')[1])

     # btyunsou is quite simple, three categories
     self.assertEqual(Counter(categories),
                      Counter(['ctime', 'length', 'click']))

Exemple #4

0

Afficher le fichier

Fichier : btyunso.py Projet : deevarvar/magneturl-crawler

def getvalidpage(kw, category, index):
    """Fetch one search-result page and return it parsed, or ``None``.

    The page name is ``<kw>_<category>_<index>.html``; it is appended to
    ``SCHEMA + DOMAIN + '/search/'`` to form the request URL and joined to
    ``STOREDPATH`` to form the ``outhtml`` argument of ``gethtml``.

    Returns the ``pq`` document when it contains a ``div.media-body``
    element with content; otherwise logs an error and returns ``None``.
    """
    page_name = "{kw}_{category}_{index}.html".format(
        kw=kw, category=category, index=index)
    search_url = SCHEMA + DOMAIN + '/search/' + page_name
    logger.info('qurl is {}'.format(search_url))

    raw = gethtml(url=search_url, outhtml=os.path.join(STOREDPATH, page_name))
    document = pq(raw)
    if document('div.media-body').html() is not None:
        return document

    logger.error('No result for {}'.format(kw))
    return None

Exemple #5

0

Afficher le fichier

Fichier : test_btyunso.py Projet : deevarvar/magneturl-crawler

 def test_entryurl(self):
     """The entry page must contain a form using GET that targets /search."""
     raw = gethtml(
         url=self.url,
         outhtml=os.path.join(self.outpath, 'btmain.html'),
     )
     search_form = pq(raw)('form')

     # should have the form element
     self.assertIsNotNone(search_form.text())

     form_method = search_form.attr('method')
     form_action = search_form.attr('action')
     # method should be 'get'
     self.assertEqual(form_method, 'get')
     # action should be '/search'
     self.assertEqual(form_action, '/search')

Exemple #6

0

Afficher le fichier

Fichier : cnbtkitty.py Projet : deevarvar/magneturl-crawler

def getentrypage(kw):
    """POST a keyword search to ``BTURL`` and return the parsed result page.

    The keyword is sent as the ``keyword`` form field.  The ``outhtml``
    argument given to ``gethtml`` is ``<kw>_Relevance_1.html`` joined to
    ``STOREDPATH``.

    Returns the ``pq`` document when it contains a ``dl.list-con`` element
    with content; otherwise logs an error and returns ``None``.
    """
    page_name = "{0}_Relevance_1.html".format(kw)
    raw = gethtml(
        url=BTURL,
        outhtml=os.path.join(STOREDPATH, page_name),
        method='POST',
        data={"keyword": kw},
        proxies=proxies,
    )
    document = pq(raw)
    if document('dl.list-con').html() is not None:
        return document

    logger.error('No result for {}'.format(kw))
    return None

Exemple #7

0

Afficher le fichier

Fichier : cnbtkitty.py Projet : deevarvar/magneturl-crawler

def fetchresult(url):
    """Fetch one result page and log the magnet link text found on it.

    ``url`` is appended to ``BTURL`` to form the request target; the third
    ``/``-separated piece of ``url`` is joined to ``STOREDPATH`` to form the
    ``outhtml`` argument of ``gethtml``.  The text of ``dd.magnet a`` is
    logged when non-empty, otherwise a "no result" message is logged.
    Afterwards the function sleeps for ``randint(1, 5)`` seconds.
    """
    target = BTURL + url
    saved_copy = os.path.join(STOREDPATH, url.split('/')[2])
    document = pq(gethtml(url=target, outhtml=saved_copy, proxies=proxies))

    magnet = document('dd.magnet a').text()
    # FIXME: even when the url is valid, the magnet uri may be missing
    if not magnet:
        logger.info("no result for {}.".format(target))
    else:
        logger.info(magnet)

    # pause between requests
    time.sleep(randint(1, 5))

Exemple #8

0

Afficher le fichier

Fichier : spider.py Projet : schoeu/spid

def getleavelsinfo():
    """Add a ``vurl`` to the records of each JSON file in ``./infolist``.

    A file is processed when its joined path does not contain ``all`` and
    its extension is ``.json``.  For each record with a truthy ``url`` the
    page is downloaded and passed to ``getleavesinfo``; a truthy result is
    stored under the record's ``vurl`` key.  The (possibly updated) records
    are then saved under the same file name in ``./leaveinfo``.
    """
    rootpath = './infolist'
    lpath = './leaveinfo'

    for filename in os.listdir(rootpath):
        source = os.path.join(rootpath, filename)
        if 'all' in source or os.path.splitext(source)[1] != '.json':
            continue

        records = utils.getjsondata(source)
        for record in records:
            if not record['url']:
                continue
            page = utils.gethtml(record['url'])
            vurl = getleavesinfo(page)
            print(record['title'], vurl)
            if vurl:
                record['vurl'] = vurl

        destination = os.path.join(lpath, filename)
        utils.savejson(destination, records)
        print(destination, ' done.')

Exemple #9

0

Afficher le fichier

Fichier : spider.py Projet : schoeu/spid

def savesubinfo():
    '''
        Walk the numbered pages of every entry read from ``alljsonpath``
        and save what ``getsubinfo`` extracts, one JSON file per entry type.

        The literal text ``pagenum`` in an entry's ``pageurl`` is replaced
        by the page number; entries with a falsy ``pageurl`` are skipped.
    '''
    for entry in utils.getjsondata(alljsonpath):
        purl = entry['pageurl']
        vtype = entry['type']
        if not purl:
            continue

        collected = []
        # pages are numbered from 1; stop at the first page without results
        pagenum = 1
        while True:
            page = utils.gethtml(purl.replace('pagenum', str(pagenum)))
            found = getsubinfo(page)
            if len(found) == 0:
                break
            collected.extend(found)
            pagenum += 1

        utils.savejson('./infolist/{vtype}.json'.format(vtype=vtype), collected)
        print('共{count}条'.format(count=len(collected)))

Exemple #10

0

Afficher le fichier

Fichier : plugin.py Projet : ledzgio/e2-italiafilms

    def go(self):
        """Act on the highlighted menu entry and advance the navigation state.

        ``self.theFunc`` names the current screen and selects the branch:

        * ``"main"``: the selected value becomes ``self.mainobj`` and its
          ``osdList`` is displayed (next state ``"genres"``); the ``"about"``
          entry instead passes ``self.about_text`` to ``self.askForWord``.
        * ``"genres"``: the selected value is a URL; that page and every page
          returned by ``self.mainobj.getPages`` are downloaded and the videos
          found on them are listed (next state ``"movie"``).
        * ``"movie"``: the selected value is a URL; the links returned by
          ``self.mainobj.getMovie`` are listed (next state ``"host"``).
        * ``"show"`` / ``"episode"``: no action.
        * ``"host"``: the selected value is resolved with
          ``utils.getResolverURL`` and, when the result is truthy, opened in
          ``MoviePlayer``.

        Except in the ``"host"`` state, the current screen is first recorded
        in ``self.historyList`` and ``self.historyInt`` is incremented.

        Returns ``None`` in every case.  Several branches return early, which
        skips the final cursor reset and debug output.
        """
        selection = self["myMenu"].l.getCurrentSelection()
        returnTitle = selection[0]
        returnValue = selection[1]
        returnIndex = self["myMenu"].getSelectedIndex()

        if not self.theFunc == "host":
            history_entry = [self.theFunc, self.osdList, returnIndex]
            try:
                self.historyList[int(self.historyInt)] = history_entry
            except IndexError:
                # no slot at this depth yet: grow the history instead
                self.historyList.append(history_entry)

            self.historyInt = self.historyInt + 1

        if self.theFunc == "main":
            print(">>>>>>>>>>>>main")
            print(self.theFunc)
            if not returnValue == "about":
                self.mainobj = returnValue
                self["myMenu"].setList(returnValue.osdList)
                self["myText"].setText(self.mainobj.description)
                self.theFunc = "genres"
            else:
                self.askForWord(self.about_text)

        elif self.theFunc == "genres":
            if not returnValue == "about":
                print(">>>>>>>>>>>>genres")
                print(self.theFunc)
                url = returnValue
                html = utils.gethtml(url)
                print(url)
                pages = self.mainobj.getPages(html)
                if pages is None:
                    return

                # The selected page itself is always scanned first.
                # NOTE(review): it is downloaded a second time here - confirm
                # whether the HTML fetched above could be reused.
                videos = []
                for page in [returnValue] + pages:
                    print("URL >>>>>>>>>: " + page)
                    html = utils.gethtml(page)
                    vids_tmp = self.mainobj.getVideos(html)
                    if vids_tmp:
                        videos = videos + vids_tmp

                if not videos:
                    return

                # keep only the first three fields of every video record
                self.osdList = [(x[0], x[1], x[2]) for x in videos]
                if self.mainobj.to_sort:
                    self.osdList.sort()
                self.lastVideosList = self.osdList
                self["myMenu"].setList(self.osdList)
                num_videos = len(videos)
                self["myText"].setText(self.mainobj.description + "\n\nSono presenti "+str(num_videos)+" film nella categoria "+returnTitle)
                self.theFunc = "movie"

        elif self.theFunc == "movie":
            print(self.theFunc)
            url = returnValue
            print(url)
            html = utils.gethtml(url)
            vklink = self.mainobj.getMovie(html, returnTitle)
            if vklink is None:
                return

            self["myText"].setText("\nTitolo Film: "+returnTitle+"\n"+"URL: "+url)
            self.osdList = []
            if vklink:
                # label each link with its second field and its position
                for tmpindex, link in enumerate(vklink):
                    self.osdList.append((_(link[1] + " / " + str(tmpindex)), link[0]))

                self["myMenu"].setList(self.osdList)
                self.theFunc = "host"
            else:
                print("#### NO VIDEO LINKS FOUND")
                print(url)
                text = str(self["myText"].getText()) + "\n\nSpiacente, nessun servizio di streaming trovato!"
                self["myText"].setText(text)
                print(self.historyInt)
                self.theFunc = "movie"

        elif self.theFunc == "show":
            pass

        elif self.theFunc == "episode":
            pass

        elif self.theFunc == "host":
            print(">>>>>>>>>>>>host")
            print(self.theFunc)
            print(returnValue)
            returnUrl = utils.getResolverURL(returnValue)

            if not returnUrl:
                return

            # NOTE(review): meaning of 4097 and of data slot 2 is not visible
            # here - presumably a service type and a buffer size; confirm.
            fileRef = eServiceReference(4097, 0, returnUrl)
            fileRef.setData(2, 10240*1024)
            fileRef.setName(returnTitle)
            self.session.open(MoviePlayer, fileRef)

        self["myMenu"].moveToIndex(0)
        # debug output; printed for every state that reaches this point
        print("HOST")
        print(self.theFunc)