Example 1
0
def panel(url):
        """Scrape one "blok-liste" panel and register a directory entry per series.

        NOTE(review): despite its name, `url` is used as an integer index
        selecting which "blok-liste" <div> to walk — confirm against callers.
        """
        # NOTE(review): `web` is not defined in this function — presumably a
        # module-level global holding the site URL; verify before refactoring.
        link=araclar.get_url(web)
        soup=BS(link.decode('utf-8','ignore'))
        div = soup.findAll("div",{"class":"blok-liste"})
        for li in div[int(url)].findAll('li'):#------------- finds the series home pages
                # Rebinds the `url` parameter; safe here because div[int(url)]
                # was already evaluated before the loop started.
                url= li.a['href']
                name = li.a.text
                name=name.encode("utf-8")
                # NOTE(review): `fileName` is not defined in this block —
                # presumably a module-level global; confirm.
                araclar.addDir(fileName,name,"kategoriler(url)",url,"YOK") 
Example 2
0
    def run(self):
        """Fetch self.url, parse it as GB18030 HTML, and print each song link.

        Python 2 code (urllib2, print statements). Side effects only: writes
        one "<url> <title>" line per matched anchor to stdout.
        """
        #resp = urllib2.urlopen(self.url)
        #print self.url, resp.getcode()
        # NOTE(review): `headers` is not an attribute — presumably a
        # module-level dict with a browser User-Agent; confirm.
        req = urllib2.Request(url=self.url,headers=headers)
        content = urllib2.urlopen(req)
        # Site serves GB18030-encoded pages; tell the parser explicitly.
        soup = BeautifulSOAP(content,fromEncoding="gb18030")
#print soup.originalEncoding
#print  soup.prettify()
        # Anchors whose href looks like /song/<digits> are song pages.
        songlist = soup.findAll('a',{'href':re.compile(r'/song/(\d)+')})
#print dir(songlist[0])
        for song in songlist:
            song_url=''
            # NOTE(review): no "http://" scheme is prepended to the host.
            song_url= 'www.xiami.com' + song.get('href')
            print song_url ,song.string
Example 3
0
def geturladdress(keywords,type,number,filename):
    """Walk paged search results and persist a {title: domain} mapping.

    keywords/type are forwarded to getresponse(); `number` is a string count
    of results (100 per page); the collected mapping is written via
    writetofile(filename, ...). Python 2 code (string.atoi, BeautifulSoup 3).
    """
    title_to_domain = {}
    page_count = string.atoi(number)/100
    for page in range(page_count):
        page_html = getresponse(keywords, type, page * 100)
        page_soup = BeautifulSOAP(page_html)
        for hit in page_soup.findAll('li', {'class': 'g'}):
            anchor = hit.find('a')
            if not anchor:
                continue
            # Join all text nodes under the anchor to recover the full title.
            title = html_unescape(''.join(anchor.findAll(text=True)))
            domain = getdomain(anchor['href'])
            title_to_domain[title] = domain
    writetofile(filename, title_to_domain)
Example 4
0
def geturladdress(keywords, type, number, filename):
    """Collect search-result titles and their domains, then save to a file.

    Fetches pages of 100 results via getresponse(), extracts each result
    anchor's text (HTML-unescaped) and its link's domain, and hands the
    accumulated dict to writetofile(). Python 2 code (string.atoi).
    """
    results_map = {}
    total_pages = string.atoi(number) / 100
    page_index = 0
    while page_index < total_pages:
        soup = BeautifulSOAP(getresponse(keywords, type, page_index * 100))
        for entry in soup.findAll('li', {'class': 'g'}):
            link_tag = entry.find('a')
            if link_tag:
                # Concatenate the anchor's text nodes, then unescape entities.
                label = html_unescape(''.join(link_tag.findAll(text=True)))
                results_map[label] = getdomain(link_tag['href'])
        page_index += 1
    writetofile(filename, results_map)
Example 5
0


# Python 2 script: scrape a Xiami artist "top songs" page and print each song.
import re,urllib,urllib2
from BeautifulSoup import BeautifulSOAP

# Artist top-songs listing to scrape.
url= 'http://www.xiami.com/artist/top/id/1234'

# Spoof a desktop-browser User-Agent so the site serves the normal page.
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 5.1; rv:27.0) Gecko/20100101 Firefox/27.0"}
req = urllib2.Request(url=url,headers=headers)
content = urllib2.urlopen(req)
# Page is served in GB18030; tell the parser explicitly.
soup = BeautifulSOAP(content,fromEncoding="gb18030")
#print soup.originalEncoding
#print  soup.prettify()

# Anchors whose href looks like /song/<digits> are individual song pages.
songlist = soup.findAll('a',{'href':re.compile(r'/song/(\d)+')})
#print dir(songlist[0])
for song in songlist:
    song_url=''
    # NOTE(review): no "http://" scheme is prepended to the host.
    song_url= 'www.xiami.com' + song.get('href')
    print song_url ,song.string

#songlist = re.findall(pattern,string)
#songlist = re.findall(pattern,content)
#for song in songlist:
#    print song
#    
print "end"