Example No. 1
    def parse_summary(self, summary, link):
        """处理文章"""

        soup = BeautifulSoup(summary)

        for span in list(soup.findAll(attrs={ "style" : "display: none;" })):
            span.extract()

        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr:True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        for img in list(soup.findAll('img')):
            if (self.max_image_number >= 0  and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
                img.extract()
            else:
                try:
                    localimage = self.down_image(img['src'], link)

                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                except Exception, e:
                    print e
                    img.extract()
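
The snippet above relies on the BeautifulSoup 3 API (findAll, has_key) and Python 2 exception syntax. A minimal sketch of the same filtering logic against BeautifulSoup 4 / Python 3, assuming down_image keeps the signature used above:

# Sketch only: bs4 / Python 3 rewrite of the image-filtering loop above.
# down_image(src, link) and max_image_number are assumed to behave as in the original.
from bs4 import BeautifulSoup

BLOCKED_PREFIXES = ("http://union.vancl.com/",
                    "http://www1.feedsky.com/",
                    "http://feed.feedsky.com/~flare/")

def filter_images(summary, link, down_image, max_image_number=-1):
    soup = BeautifulSoup(summary, "html.parser")
    img_count = 0
    for img in soup.find_all("img"):
        src = img.get("src")                 # bs4 tags have no has_key(); use get()
        if (0 <= max_image_number <= img_count) or not src or src.startswith(BLOCKED_PREFIXES):
            img.extract()
            continue
        try:
            localimage = down_image(src, link)
        except Exception as e:               # Python 3 exception syntax
            print(e)
            localimage = None
        if localimage:
            img["src"] = localimage
            img_count += 1
        else:
            img.extract()
    return soup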
Example No. 2
    def parse_summary(self, summary, link):
        """处理文章"""

        soup = BeautifulSoup(summary)

        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        for img in list(soup.findAll('img')):
            if (self.max_image_number >= 0  and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
                img.extract()
            else:
                try:
                    localimage = self.down_image(img['src'], link)

                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                except Exception, e:
                    print e
                    img.extract()
Example No. 3
	def getPresentation(self):
		base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_'
		for i in range(1, 35): # pull the next two days' campus presentation info for each of the 34 provinces [1-34]
			url = base_url + str(i) + '.html'
			#print url
			try:
				page = self.getRes(url) 
				soup = BeautifulSoup(page)
			except: # failed to open the URL
				continue
			# collect all the countdown blocks
			try: # this city may have no presentations scheduled for the coming days
				countdowns = soup.findAll('div', {'class': 'list_topic'})
				y_m_d2, y_m_d3 = '', '' # dates of the presentations held tomorrow and the day after
				first, second = -1, -1 # indices of the campusTalk tables for tomorrow's and the day after's presentations; they sit one position after the corresponding countdown divs
				# campusTalk table 0 is the header row; real presentation data starts at table 1, so day is initialised to 1
				day = 1 
				for countdown in countdowns:
					cd = string.atoi(countdown.contents[0].contents[2].string)
					if cd > 2: # ignore presentations more than two days away
						break
					elif cd == 1: # presentation held tomorrow (one day left on the countdown)
						first = day
						y_m_d2 = countdown.contents[1].string
					elif cd == 2: # presentation held the day after tomorrow (two days left)
						second = day
						y_m_d3 = countdown.contents[1].string
					day = day + 1
				# first is tomorrow's table index, second is the day after's; -1 means no presentations that day
				if first != -1:
					tables = soup.findAll('table', {'class':'campusTalk'})[first]
					trs = tables.findAll('tr')
					for tr in trs:
						tds = tr.findAll('td')
						city = tds[0].a.string.strip()
						school = tds[1].a.string.strip()
						addr = tds[2].string.strip()
						inc = tds[3].a.string.strip()
						try: # some presentations omit the exact start time [H-M-S]
							pdate = y_m_d2 + ' ' + tds[4].string
						except Exception, e:
							pdate = y_m_d2 # fall back to recording just the date
						self.presentations.append(CPresentation(city, inc, school, pdate, addr))
				if second != -1:
					tables = soup.findAll('table', {'class':'campusTalk'})[second]
					trs = tables.findAll('tr')
					for tr in trs:
						tds = tr.findAll('td')
						city = tds[0].a.string.strip()
						school = tds[1].a.string.strip()
						addr = tds[2].string.strip()
						inc = tds[3].a.string.strip()
						try:
							pdate = y_m_d3 + ' ' + tds[4].string
						except:
							pdate = y_m_d3
						self.presentations.append(CPresentation(city, inc, school, pdate, addr))
			except:
				continue # likely no presentation info for this city in the coming days; move on
Example No. 4
def fetchSong(url, viewCount):
    try:
        #Get song info from url
        songInfo = {}
        _get = url.split('?')[1]
        tokens = _get.split('&')
        for token in tokens:
            toks = token.split('=')
            songInfo[toks[0]] = int(toks[1])
        
        #fetch the html
        lyricsWeb = urllib2.urlopen(url)  
        webContent = lyricsWeb.read()  
        lyricsWeb.close()       
    
        soup = BeautifulSoup(webContent)
    
        lyrics = soup.findAll(id="mylrc")[0].contents
        author = soup.findAll(attrs={'class' : 'link_hb'})[0].contents[0]
        album = soup.findAll(attrs={'class' : 'link_hb'})[1].contents[0]
        title = soup.findAll(attrs={'class' : 'link_hb'})[2].contents[0]    
        
        #print lyrics
        lyricsText = ''
        for line in lyrics:
            for t in line:
                lyricsText += t                       
        
        #Construct the xml
        root = ET.Element("xml")
        doc = ET.SubElement(root, "doc")
        
        sidNode = ET.SubElement(doc, "sid")
        sidNode.text = str(songInfo[u'sid'])
        aidNode = ET.SubElement(doc, "aid")
        aidNode.text = str(songInfo[u'aid'])
        lidNode = ET.SubElement(doc, "lid")
        lidNode.text = str(songInfo[u'lid'])        
        titleNode = ET.SubElement(doc, "title")
        titleNode.text = title
        authorNode = ET.SubElement(doc, "author")
        authorNode.text = author
        viewCountNode = ET.SubElement(doc, "viewCount")
        viewCountNode.text = str(viewCount)
        lyricsNode = ET.SubElement(doc, "lyrics")
        lyricsNode.text = lyricsText
        
                       
        #Construct the tree
        tree = ET.ElementTree(root)
        filename = lyricsDbPath + str(songInfo['lid']) + ".txt"        
        tree.write(filename, "utf-8")
        
    except:
        pass
Example No. 5
File: Mall.py  Project: zuojie/KKT
	def getResURL(self, url):
		page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
		soup = BeautifulSoup(page)
		try:
			if self.is_book:
				search_div = soup.findAll('div', {'name': '__link_sale'})[0] # first DOM-tree layout: used for books, film and TV items
			elif not self.is_book:
				search_div = soup.findAll('div', {'class': 'goumai_anniu'})[0] # second DOM-tree layout: used for general merchandise
			else:
				search_div = "NULL"
		except Exception, e:
			self.py_log.log("获取商品信息失败", self.py_log.get_file_name(), self.py_log.get_line())
			return ""
Example No. 6
 def readability(self, url, decoder):
     # use readability-lxml to process the full-text content
     # image data is memory-hungry, so this function is also written as a generator to save memory
     opener = URLOpener(self.host)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         logging.error('err(%d) to fetch %s.' % (status_code,url))
         return
         
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content)
     
     content = self.preprocess(content)
     
     # extract the main article body
     doc = Document(content)
     summary = doc.summary(html_partial=True)
     title = doc.short_title()
     title = self.processtitle(title)
     html = self.FragToXhtml(summary, title, addtitleinbody=True)
     
     # only the article body is left at this point, so using BeautifulSoup here is not a performance concern
     if self.keep_image:
         soup = BeautifulSoup(html)
         
         for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)):
             cmt.extract()
         
         self.soupbeforeimage(soup)
         
         for img in soup.findAll('img'):
             imgurl = img['src']
             if not imgurl.startswith('http') and not imgurl.startswith('www'):
                 imgurl = self.urljoin(url, imgurl)
             imgresult = opener.open(imgurl)
             imgcontent = imgresult.content if imgresult.status_code == 200 else None
             if imgcontent:
                 imgtype = imghdr.what(None, imgcontent)
                 if imgtype:
                     imgmime = r"image/" + imgtype
                     if imgtype == 'jpeg':
                         fnimg = "%d.jpg" % random.randint(10000,99999999)
                     else:
                         fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                     img['src'] = fnimg
                     yield (imgmime, imgurl, fnimg, imgcontent)
                     
     yield (title, None, None, html)
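
Since readability() is a generator, image tuples are yielded before the final article tuple. A possible caller, sketched here (book, url and decoder are placeholder names, not taken from the original project), could separate the two kinds of items like this:

# Sketch of a hypothetical caller: image tuples are (mime, imgurl, filename, content),
# and the last item yielded is (title, None, None, html).
images = {}
article_title = article_html = None
for first, second, third, fourth in book.readability(url, decoder):
    if second is None and third is None:
        article_title, article_html = first, fourth
    else:
        images[third] = (first, fourth)      # filename -> (mime type, raw bytes)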
Example No. 7
File: Ticket.py  Project: zuojie/KKT
	def getRes(self):
		url = self.getResURL()
		page = urllib2.urlopen(url).read()#.decode('GBK').encode('utf-8')
		soup = BeautifulSoup(page)
		main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
		#print main_wrapper.prettify()
		clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
		#print clr_after.prettify()
		items = clr_after.findAll('div', {'class': 'main'})[0]
		#print items.prettify()
		items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
		print items1.prettify().decode('utf-8').encode('gbk')
		items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
		#print items2.prettify().decode('utf-8').encode('gbk')
		
		for item in items2:
			print item
			inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
			fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
			_time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
			_discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
			_price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
			
			print inc#.decode('utf-8').encode('gbk')
			print fly_time#.decode('utf-8').encode('gbk')
			print _time#.decode('utf-8').encode('gbk')
			print _discount.decode('utf-8').encode('gbk')
			print _price.decode('utf-8').encode('gbk')
Example No. 8
 def render(self):
     content         =  cache.get(self.content_url) 
     
     # If the page is not cached, retrieve it
     if content == None:
         opener      = urllib2.build_opener()
         content     = opener.open(self.content_url, timeout=5).read()
         
         # Save the page in cache
         cache.set(self.content_url, content)
     
     soup            = BeautifulSoup(content)
     
     # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
     for tag in soup.findAll('a', href=True):
         tag['href'] = urlparse.urljoin(self.content_url, tag['href'])
     
     # If there's no element specified, use the BODY. 
     # Otherwise find the element with given id.
     if self.element_id == "":
         html        = soup.find("body").renderContents()
     else:
         html        = str(soup.find(id=self.element_id))
     
     return html
Example No. 9
def get_episodes():
	"""docstring for get_episodes"""
	
	html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania")
	soup = BeautifulSoup(html, fromEncoding="utf-8")
	print soup.originalEncoding
	episodesHtml = soup.findAll("div", attrs={"class" : "video-img-cont-catchup cont-first"})
	
	""" result
	
	<div class="video-img-cont-catchup cont-first" id="5217">
		<div class="video-date">okt 24.<span>12:15</span></div>
		<a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img">
			<img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" />
		</a>
		<a href="javascript:void(0)" class="video-add" id="5217-0">
			<img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" />
		</a>
		<div class="img-height-wide"></div>
		<h2>
			<a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a>
		</h2>
		<p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p>
	</div>
	
	"""
	
	episodes = []
	#print len(episodesHtml)
	for episode in episodesHtml:
		episodes.append({"title":episode.h2.a.string, "url":episode.h2.a['href'], "thumb":episode.a.img['src']})
	#print episodes
	return episodes
Example No. 10
def view_page(slug):
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    page.content = str(content)
    return render_template('cms_view_page.html', page=page)
Example No. 11
 def get_genres(self, url):
     """Return the available genres from the homepage."""
     html = download_page(url)
     ul_tags = BS(html, parseOnlyThese=SS('ul', {'class': 'menu'}))
     dirs = [{'name': a.span.string,
              'url': urljoin(self.base_url, a['href'] + '&limit=0'),
              'mode': '1'} for a in ul_tags.findAll('a')]
     self.add_dirs(dirs)
Example No. 12
 def sanitize_contents(self, contents):
   soup = BeautifulSoup(contents)
   for tagname in ['script', 'meta', 'head', 'link']:
     [tag.extract() for tag in soup.findAll(tagname)]
   
   attr_re = re.compile('^on.*', re.I)
   for tag in soup.findAll():
     for attr, _ in tag.attrs:
       if attr_re.match(attr):
         del tag[attr]
   for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['href']
   for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['src']
     
   sanitized_contents = soup.renderContents()
   return sanitized_contents
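
Note that the for attr, _ in tag.attrs loop above only works with BeautifulSoup 3, where attrs is a list of (name, value) pairs; in bs4 it is a dict. A minimal sketch of the same on*-handler strip written for bs4:

# Sketch only: stripping on* event-handler attributes with bs4, where tag.attrs is a dict.
import re
from bs4 import BeautifulSoup

def strip_event_handlers(html):
    soup = BeautifulSoup(html, "html.parser")
    attr_re = re.compile(r"^on", re.I)
    for tag in soup.find_all(True):
        for attr in [a for a in tag.attrs if attr_re.match(a)]:
            del tag[attr]
    return str(soup)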
Example No. 13
	def location(self,ip):
		try:
			self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
		except Exception:
			return "Earth"
		soup = BeautifulSoup(self.current_page)
		lo = soup.findAll('div', { "id" : "output" })[0].findAll('b')[1].text.encode('utf-8','ignore')
		return lo
Example No. 14
 def assert_no_error_message_in_response(self, response):
     """Check that response has no error messages."""
     soup = BeautifulSoup(response)
     el = soup.find("p", "alert-error")
     if el:
         self.fail("error message found in response unexpectedly: {}".format(el.contents))
     el = soup.findAll("label", "alert-error")
     if el:
         self.fail("error message found in response unexpectedly: {}".format(el.contents))
Example No. 15
 def Items(self):
     itemsprocessed = []
     cnt4debug = 0
     opener = URLOpener(self.host)
     decoder = AutoDecoder()
     for section, url in self.feeds:
         content = None
         cnt4debug += 1
         if IsRunInLocal and cnt4debug > 1:
             break
         
         result = opener.open(url)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             logging.error('err(%d) to fetch %s.' % (status_code,url))
             continue
         
         if self.feed_encoding:
             content = content.decode(self.feed_encoding)
         else:
             content = decoder.decode(content)
         
         content = self.preprocess(content)
         
         feed = feedparser.parse(content)
         for e in feed['entries']:
             # ads or other unwanted content in a full-text RSS feed can be stripped in postprocess
             desc = self.postprocess(e.description)
             desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
             
             if self.keep_image:
                 soup = BeautifulSoup(content)
                 self.soupbeforeimage(soup)
                 for img in soup.findAll('img'):
                     imgurl = img['src']
                     if not imgurl.startswith('http') and not imgurl.startswith('www'):
                         imgurl = self.urljoin(url, imgurl)
                     imgresult = opener.open(imgurl)
                     imgcontent = imgresult.content if imgresult.status_code == 200 else None
                     if imgcontent:
                         imgtype = imghdr.what(None, imgcontent)
                         if imgtype:
                             imgmime = r"image/" + imgtype
                             if imgtype == 'jpeg':
                                 fnimg = "%d.jpg" % random.randint(10000,99999999)
                             else:
                                 fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                             img['src'] = fnimg
                             yield (imgmime, imgurl, fnimg, imgcontent)
                 self.soupprocessex(soup)
                 desc = soup.renderContents('utf-8').decode('utf-8')
                 soup = None
             
             if e.title not in itemsprocessed and desc:
                 itemsprocessed.append(e.title)
                 yield (section, e.link, e.title, desc)
Example No. 16
def view_post(category_slug, post_slug):
    category = Category.gql("WHERE slug = :1", category_slug)[0]
    all_posts = Post.all().order('-date_created')
    post = [x for x in all_posts if x.category.slug == category_slug and x.slug == post_slug][0]
    content = BeautifulSoup(post.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    post.content = unicode(content)
    return render_template('cms_view_post.html', post=post)
Example No. 17
    def parse_summary(self, summary, ref):
        """处理文章内容,去除多余标签并处理图片地址"""

        soup = BeautifulSoup(summary)

        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        images = []
        for img in list(soup.findAll('img')):
            if (krconfig.max_image_per_article >= 0  and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False :
                img.extract()
            else:
                try:
                    if img['src'].encode('utf-8').lower().endswith(
                        ('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                        localimage, fullname = self.parse_image(img['src'])
                        # the URL must end with an image extension, to avoid downloading non-image files (e.g. fake images used for traffic analytics)
                        if os.path.isfile(fullname) is False:
                            images.append({
                                'url': img['src'],
                                'filename': fullname,
                                'referer': ref
                            })
                        if localimage:
                            img['src'] = localimage
                            img_count = img_count + 1
                        else:
                            img.extract()
                    else:
                        img.extract()
                except Exception, e:
                    logging.info("error: %s" % e)
                    img.extract()
Example No. 18
    def parse_summary(self, summary, ref):
        """处理文章内容,去除多余标签并处理图片地址"""

        soup = BeautifulSoup(summary)

        for span in list(soup.findAll(attrs = { "style" : "display: none;" })):
            span.extract()

        for attr in self.remove_attributes:
            for x in soup.findAll(attrs = {attr:True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        images = []
        for img in list(soup.findAll('img')):
            if (krconfig.max_image_per_article >= 0  and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False :
                img.extract()
            else:
                try:
                    if img['src'].encode('utf-8').lower().endswith(('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                        localimage, fullname = self.parse_image(img['src'])
                        # the URL must end with an image extension, to avoid downloading non-image files (e.g. fake images used for traffic analytics)
                        if os.path.isfile(fullname) is False:
                            images.append({
                                'url':img['src'],
                                'filename':fullname,
                                'referer':ref
                                })
                        if localimage:
                            img['src'] = localimage
                            img_count = img_count + 1
                        else:
                            img.extract()
                    else:
                        img.extract()
                except Exception, e:
                    logging.info("error: %s" % e)
                    img.extract()
Example No. 19
def get_refresh_url(page_content):
    try:
        page_soup = BeautifulSoup(page_content)
        for meta_tag in page_soup.findAll('meta'):
            if meta_tag['http-equiv'].lower() == 'refresh':
                refresh_url = meta_tag['content'].split('URL=')[1]
                return refresh_url
    except:
        pass

    return None
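
The bare meta_tag['http-equiv'] lookup raises KeyError on any <meta> without that attribute, and the blanket except then abandons the rest of the scan. A more defensive sketch of the same idea, still using the BeautifulSoup 3 API as above but with .get():

# Sketch only: <meta> tags without http-equiv no longer abort the whole scan.
def get_refresh_url(page_content):
    page_soup = BeautifulSoup(page_content)
    for meta_tag in page_soup.findAll('meta'):
        http_equiv = (meta_tag.get('http-equiv') or '').lower()
        content = meta_tag.get('content') or ''
        if http_equiv == 'refresh' and 'URL=' in content:
            return content.split('URL=', 1)[1]
    return None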
Example No. 20
    def assert_warning_message_in_response(self, response, message=""):
        """Check if response contains one or more warning messages.

		Assume warning messages rendered as <p class="alert-warning"> elements.
		"""
        soup = BeautifulSoup(response)
        alert = soup.findAll("p", "alert-warning")
        self.assertGreater(len(alert), 0, "no warning message found in response")
        if message:
            found = str(alert[0]).find(message)
            self.assertGreater(found, 0)
Example No. 21
def parse_page(writer, catalogue, page=1):
    print 'Parsing page %s' % page
    
    url = urllib.urlopen(URL % (catalogue, page))
    soup = BeautifulSoup(url)
    
    table = soup.find('table', attrs={'class': 'snippets'})
    for tr in table.findAll('tr'):
        # get name of the page
        name = tr.td.h4.a.string
        
        # get URL of the page
        url = tr.td.h4.a['href'].encode('utf-8')
        
        #get stats info
        stats = '?'
        stats_element = tr.find('p', attrs={'class': 'Stats'})
        if stats_element:
            stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '')
            if stats == 'wtrakc': 
                stats = '?'
        
        # get price
        price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12]
        
        # calculate CPM
        cpm = '?'
        try:
            cpm = (float(price)*30) / int(stats) * 1000
        except:
            cpm = '?'
        
        # write to the file
        row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')]
        print row
        writer.writerow(row)
    
    # find last page of the catalogue
    anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+'))
    if not anchors:
        return
    
    pages = []
    for anchor in anchors:
        number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1)
        pages.append(int(number))

    pages.sort()
    last = pages[-1]
    
    # parse next page if exists
    if last > page:
        next = page + 1
        parse_page(writer, catalogue, next)
Example No. 22
    def assert_has_div_with_ID(self, response, id_attr):
        """Check if response contains a Div with a particular ID attribute.

		<div id="<some-id>"> elements.
		"""
        soup = BeautifulSoup(response)
        alert = soup.findAll("div", id=id_attr)
        if alert:
            self.assertGreater(len(alert), 0, "No Div tag with, id=%s, in response" % str(id_attr))
        else:
            self.fail("No Div tag with, id=%s, in response" % str(id_attr))
Example No. 23
def strip_professors(html, name):
	"""Returns list of professor matches"""
	profs = []
	
	table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
	if table is None:
		logging.debug(html[500:])
		return profs

	split = name[:-1].upper().split(',')
	qLast = split[0]
	try:
		qFirst = split[1]
	except:
		qFirst = ''
			
	rows = table.findAll('div', {'class': re.compile(r"entry (odd|even)")})

	for row in rows:
		divName = row.find('div', {'class': 'profName'})
		anchor = divName.find('a')
		profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
		
		try:
			firstName = profName.split(',')[1]
		except:
			firstName = ''
			
		# logging.info('Searching against: ' + profName)
		
		if profName.startswith(qLast) and qFirst in firstName:						
			href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
			profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
			profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
			profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
			profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
			profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()
			
			if profHot == 'Hot':
				profHot = '&#x2713;'
			else:
				profHot = '&nbsp;'

			profs.append({
				'name': profName,
				'href': href,
				'dept': profDept,
				'ratings': profRatings,
				'quality': profQuality,
				'easiness': profEasiness,
				'hot': profHot
			})

	return profs
Example No. 24
def league_settings(league_id, access_code):
	response = urlfetch.fetch("http://football.fantasysports.yahoo.com/f1/%s/settings?pak=%s" % (league_id, access_code))
	settings_table_soup = BeautifulSoup(response.content).find("table", attrs={'id': 'settings-table'})
	
	positions = defaultdict(int)
	for p in [str(s.strip()) for s in settings_table_soup.findAll('tr')[23].find('td', attrs={'width': '410'}).b.contents[0].split(',')]:
		positions[p] += 1
	
	#bench_spots = roster_positions.count('BN')
	
	return positions
	
Example No. 25
def strip_search(html):
	form_html = BeautifulSoup(html).find('form', action='http://websoc.reg.uci.edu/')
	
	#replace form submit with our own link
	form_html['action'] = '/schedules'
	
	#remove 'Display Text Results' button
	text_buttons = form_html.findAll(attrs={"class" : "banner-width"})
	for i in text_buttons:
		i.replaceWith('<p id=\"submit-container\"><input type="submit" value="Display Results" name="Submit"></p>')
	
	return str(form_html)
Example No. 26
def get_playlist(path):
	"""Fetches the playlist for a DJ set"""


	#parse the playlist
	playlistXml = retrieve_url(BASE_URL+get_asset_path(path)+'.xml')
	soup = BeautifulSoup(playlistXml)
	tracks = soup.findAll('track')
	playlist = []
	for track in tracks :
		playlist.append({"title":unescape(track.title.string), "artist":unescape(track.creator.string), "file":track.location.string, "start":track.meta.nextSibling})
	return playlist
Example No. 27
def get_sets():
	"""Retrieves the most popular sets from Mugasha"""
#	usock = open(BASE_CURRENT_SOURCE_PATH, "r")
	
	html = retrieve_url(BASE_URL+"/browse/sets")
	soup = BeautifulSoup(html)
	setsHtml = soup.findAll('div', attrs={"class" : "setTabs set-sel-weekly"})
	sets = []
	for setHtml in setsHtml:
		anchors = setHtml.findAll('a')
		sets.append({"title":anchors[1].string, "thumb_url":anchors[2].img['src'], "browse_path":anchors[1]['href']})
	return sets
Example No. 28
def parse_matchup_info(league, team_id):
	logging.info("team_id: %d" % team_id)
	
	generic_matchup_url = build_url(league_id=league.id, page='matchup', params={'mid1': team_id}, access_code=league.access_code)
	try:
		matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
	except:
		matchup_soup = None	
		
	logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
	
	if matchup_soup:
		team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
		score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
	else:
		team_names = None
		score = None
	
	if team_names and score:
		return {'score': score, 'team_names': team_names}
	else:
		return None
Example No. 29
def strip_professors(html, name):
	table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
	if table is None:
		logging.debug(html[500:])
		return get_rmp_error('Parse Error','Could not find "ratingTable" at RateMyProfessors.com')
	else:
		profs = list()
		#name = name.upper()
		split = name.split(',');
		qLastName = split[0].strip()
		qFirstName = split[1].strip()
		if (qFirstName == None or qFirstName == ''):
			qFirstName = '!'
		rows = table.findAll('div', {'class': re.compile(r".*\bentry\b.*")})
		for row in rows:
			divName = row.find('div', {'class': 'profName'})
			anchor = divName.find('a')
			profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
			split = profName.split(',');
			lastName = split[0].strip()
			firstName = split[1].strip()
			if (firstName == None or firstName == ''):
				firstName = '!'
			#logging.debug(qLastName + ' =? ' + lastName + ' && ' + qFirstName + ' =? ' + firstName)
			if lastName == qLastName and firstName[0] == qFirstName[0]:
				href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
				profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
				profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
				profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
				profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
				profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()
				if profHot == 'Hot':
					profHot = '&#x2713;'
				else:
					profHot = '&nbsp;'
				
				prof = {
					'name': profName,
					'href': href,
					'dept': profDept,
					'ratings': profRatings,
					'quality': profQuality,
					'easiness': profEasiness,
					'hot': profHot
				}
				#logging.debug(prof)
				profs.append(prof)
		return json.dumps(profs)
Example No. 30
    def assert_error_message_in_response(self, response, message=""):
        """Check if response contains one or more error messages.

		Assume error messages rendered as <p class="alert-error"> elements.
		"""
        soup = BeautifulSoup(response)
        # logging.info(soup)
        alert = soup.findAll("p", "alert-error")
        logging.info(alert)
        if len(alert) > 0:
            pass
        else:
            self.fail("no error message found in response")
        if message:
            found = str(alert[0]).find(message)
            self.assertGreater(found, 0)
Example No. 31
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'

        day = date.today() - relativedelta(days=1)
        response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
        if response and response.status_code == 200:
            feed_soup = BeautifulSoup(response.content)
            [copyright_el.extract() for copyright_el in feed_soup.findAll("copyright")]

            self.response.out.write("%s\n\n\n" % feed_soup.prettify())
            DailyFeedSnapshot.create(day, feed_soup.prettify())
            msg = "Created a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.info(msg)
        else:
            msg = "Could not create a DailyFeedSnapshot for %s." % (day)
            self.response.out.write(msg)
            logging.error(msg)
Example No. 32
def clawdata(data):
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"

    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)

    for tag in soup.findAll(valign="middle"):
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = [];
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != "&nbsp;":
                    dictIssue["luckNum"].append(int(tag.text))
            print dictIssue
Example No. 33
def clawdata(data):
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"

    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)

    for tag in soup.findAll(valign="middle"):
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = []
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != "&nbsp;":
                    dictIssue["luckNum"].append(int(tag.text))
            print dictIssue
Example No. 34
def parse_organic_contents(raw_content, organic_pos):
    data_dict = {}
    data_dict['position'] = organic_pos

    b = BeautifulSoup(raw_content)
    rtitle = b.find('a')
    headline = p.sub('', str(rtitle))
    data_dict['title'] = headline

    display_url = parse_display_url(str(raw_content))
    data_dict['display_url'] = display_url

    rhref = b.find('a', href=True)
    url = str(rhref['href'])
    data_dict['url'] = ul.unquote(url)

    rtext = b.findAll('div', {'class': 's'})
    text = p.sub('', str(rtext))
    data_dict['text'] = text.replace(']', '').replace('[', '')
    return data_dict
Example No. 35
def getViewCount(songTitle):

    try:
        youtube = 'http://gdata.youtube.com/feeds/api/videos?v=2&max-results=1&q='
        #songTitle = urllib2.quote(songTitle)
        #print songTitle
        url = youtube + songTitle
        #print url

        web = urllib2.urlopen(url)
        content = web.read()
        web.close()

        soup = BeautifulSoup(content)
        stats = soup.findAll('yt:statistics')

        return int(stats[0]['viewcount'])

    except:
        return 0
Example No. 36
def TPB(book=None, test=False):
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:

        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }

        searchURL = providerurl + "?%s" % urllib.urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)

        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split(
                                '<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace('&nbsp;', '')
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' %
                                             (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid':
                                    book['bookid'],
                                    'tor_prov':
                                    provider,
                                    'tor_title':
                                    title,
                                    'tor_url':
                                    magnet,
                                    'tor_size':
                                    str(size),
                                    'tor_type':
                                    'magnet',
                                    'priority':
                                    lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' %
                                             (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
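
The K/M/G size-to-bytes conversion above is repeated verbatim in several of these providers (TPB here, GEN and KAT below); a small helper capturing the same logic might look like this (a sketch, not part of LazyLibrarian):

# Sketch only: the repeated size-string conversion factored into one helper.
def size_to_bytes(size_text):
    """Convert strings like '702.1 KiB' or '1.4 GB' to an int byte count (0 if unparseable)."""
    size = size_text.replace('&nbsp;', '').upper()
    mult = 1
    if 'K' in size:
        size, mult = size.split('K')[0], 1024
    elif 'M' in size:
        size, mult = size.split('M')[0], 1024 * 1024
    elif 'G' in size:
        size, mult = size.split('G')[0], 1024 * 1024 * 1024
    try:
        return int(float(size) * mult)
    except ValueError:
        return 0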
Example No. 37
def GEN(book=None, prov=None):
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]

    page = 1
    results = []
    next_page = True

    while next_page:
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }

        if page > 1:
            params['page'] = page

        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urllib.urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug(u"No results found from %s for %s" %
                             (provider, book['searchterm']))
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error(
                    'Access forbidden. Please wait a while before trying %s again.'
                    % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if result:
            logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            try:
                soup = BeautifulSoup(result)
                try:
                    table = soup.findAll('table')[2]  # un-named table
                    if table:
                        rows = table.findAll('tr')
                except IndexError:  # no results table in result page
                    rows = []

                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]

                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.findAll('td')
                    if 'index.php' in search and len(td) > 3:
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[0].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                BeautifulStoneSoup(
                                    td[2].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            temp = str(td[4])
                            temp = temp.split('onmouseout')[1]
                            extn = temp.split('">')[1].split('(')[0]
                            size = temp.split('">')[1].split('(')[1].split(
                                ')')[0]
                            size = size.upper()
                            link = temp.split('href=')[1].split('"')[1]
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s' %
                                str(e))

                    elif 'search.php' in search and len(td) > 8:
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[1].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                td[2]).split('>')[2].split('<')[0].strip()
                            title = str(
                                BeautifulStoneSoup(
                                    title,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            link = str(td[2]).split('href="')[1].split(
                                '?')[1].split('"')[0]
                            size = unaccented(td[7].text).upper()
                            extn = td[8].text
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s' %
                                str(e))

                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0

                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn

                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)

                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    u"No results found from %s for %s" %
                                    (provider, book['searchterm']))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' %
                                    (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False

                        if bookresult:
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith(
                                                'http'
                                        ) and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split(
                                                '/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split(
                                                '/download/book')[1]
                                            break

                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.debug(
                                    'Error parsing bookresult for %s: %s' %
                                    (link, str(e)))
                                url = None

                        if url:
                            results.append({
                                'bookid':
                                book['bookid'],
                                'tor_prov':
                                provider + '/' + search,
                                'tor_title':
                                title,
                                'tor_url':
                                url,
                                'tor_size':
                                str(size),
                                'tor_type':
                                'direct',
                                'priority':
                                lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
                        next_page = True

            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results, errmsg
Example No. 38
########################start of main###################################

for i in range(startId, endId):
    
    url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i)

    #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51")  
    lyricsWeb = urllib2.urlopen(url)
    
    webContent = lyricsWeb.read()  
    lyricsWeb.close()  
    
    soup = BeautifulSoup(webContent)
    
    pages = soup.findAll('a')
    wantedPages = []
    for page in pages:        
        if re.search("&page=", page['href']):
            #print page['href']
            url = page['href']
            wantedPages.append(url)
            
    if len(wantedPages) > 1: #find those who has more than 20 albums    
        
        maxPageNum = 1 #Max 1 page for each singer
        pageNum = 0
        maxSongNum = 250
        songNum = 0  
        fetchNum = 0
        
Example No. 39
# -*- coding:utf-8 -*-
import re
import urllib2
from lib.BeautifulSoup import BeautifulSoup

agent="""Sosospider+(+http://help.soso.com/webspider.htm)"""

blog_url = 'http://blog.sina.com.cn/s/articlelist_1517582220_0_1.html'
spider_handle = urllib2.urlopen(blog_url)
blog_content = spider_handle.read()
soup = BeautifulSoup(blog_content, fromEncoding='utf-8')
item_list = soup.findAll('span', {'class':'atc_title'})

urls = ['http://blog.csdn.net/heiyeshuwu/archive/2010/12/19/6085876.aspx']
#for item in item_list:
#    urls.append(item.a['href'])
    
for url in urls:
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    handle = urllib2.urlopen(request).read()
    article_soup = BeautifulSoup(handle, fromEncoding='utf-8')
    title = article_soup.find('h1',{'class':'title_txt'})
    content = article_soup.find('div',{'id':'sina_keyword_ad_area2'})
#    tmp = []
#    for c  in content.contents:
#        print type(c)
#        tmp.append(c.__str__('utf-8'))
    print url
    print title.contents
    print title.contents[2].replace('\t', '').replace('\r\n', '')
Example No. 40
def KAT(book=None, test=False):
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/usearch/" +
                          urllib.quote(book['searchterm']))

    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
            errmsg = result
        result = False

    if test:
        return success

    results = []

    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        rows = []
        try:
            table = soup.findAll('table')[1]  # un-named table
            if table:
                rows = table.findAll('tr')
        except IndexError:  # no results table in result page
            rows = []

        if len(rows) > 1:
            rows = rows[1:]  # first row is headers

        for row in rows:
            td = row.findAll('td')
            if len(td) > 3:
                try:
                    title = unaccented(
                        str(td[0]).split('cellMainLink">')[1].split('<')[0])
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(
                            td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split(
                            '.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass

                    if not url or (magnet and url
                                   and lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'

                    try:
                        size = str(td[1].text).replace('&nbsp;', '').upper()
                        mult = 1
                        if 'K' in size:
                            size = size.split('K')[0]
                            mult = 1024
                        elif 'M' in size:
                            size = size.split('M')[0]
                            mult = 1024 * 1024
                        elif 'G' in size:
                            size = size.split('G')[0]
                            mult = 1024 * 1024 * 1024
                        size = int(float(size) * mult)
                    except (ValueError, IndexError):
                        size = 0
                    try:
                        seeders = int(td[3].text)
                    except ValueError:
                        seeders = 0

                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid': book['bookid'],
                            'tor_prov': provider,
                            'tor_title': title,
                            'tor_url': url,
                            'tor_size': str(size),
                            'tor_type': mode,
                            'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
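A note on the size handling above: the human-readable size column ("1.4 GB", "900 KB", ...) is converted to bytes with a K/M/G branch that recurs almost verbatim in the providers that follow. A minimal helper capturing that conversion might look like the sketch below; size_to_bytes is an illustrative name only and is not part of lazylibrarian.

def size_to_bytes(size_text):
    """Convert a display size such as '1.4 GB' to an integer byte count.

    Illustrative helper only; the provider functions in this listing
    inline this same logic.
    """
    size = size_text.replace('&nbsp;', '').upper()
    mult = 1
    if 'K' in size:
        size = size.split('K')[0]
        mult = 1024
    elif 'M' in size:
        size = size.split('M')[0]
        mult = 1024 * 1024
    elif 'G' in size:
        size = size.split('G')[0]
        mult = 1024 * 1024 * 1024
    try:
        return int(float(size) * mult)
    except ValueError:
        return 0

size_to_bytes('1.4 GB')  # -> 1503238553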
Ejemplo n.º 41
0
def TDL(book=None, test=False):
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host)

    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)

    sterm = makeUnicode(book['searchterm'])

    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False

    if test:
        return success

    results = []

    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None

                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            new_soup = BeautifulSoup(result)
                            for link in new_soup.findAll('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break

                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'magnet',
                                'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))

    return results, errmsg
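For orientation, these provider functions each take a book dictionary and return a (results, errmsg) pair; test=True only checks that the endpoint answers. A hypothetical call, with a made-up search term and book id and assuming the relevant lazylibrarian config entries are already populated, might look like this:

book = {'bookid': '1234', 'searchterm': 'Frank Herbert Dune'}   # made-up values

ok = TDL(book=book, test=True)       # True if the RSS endpoint responded
results, errmsg = TDL(book=book)     # list of result dicts plus any error text
for r in results:
    print('%s  %s bytes  (%s)' % (r['tor_title'], r['tor_size'], r['tor_type']))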
Ejemplo n.º 42
0
def WWT(book=None, test=False):
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/torrents-search.php")

    sterm = makeUnicode(book['searchterm'])

    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {"search": book['searchterm'], "page": page, "cat": cat}
        searchURL = providerurl + "/?%s" % urllib.urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result)

            rows = []
            try:
                tables = soup.findAll('table')  # un-named table
                table = tables[2]
                if table:
                    rows = table.findAll('tr')
            except IndexError:  # no results table in result page
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.findAll('td')
                if len(td) > 3:
                    try:
                        title = unaccented(
                            str(td[0]).split('title="')[1].split('"')[0])
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(
                                td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                          str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass

                        if not url or (magnet and url and
                                       lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'

                        try:
                            size = str(td[1].text).replace('&nbsp;',
                                                           '').upper()
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < int(seeders):
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': mode,
                                'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
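Unlike KAT and TDL above, WWT pages through results: next_page stays true only while a page still yields accepted rows, and MAX_PAGES (0 meaning unlimited) caps the walk. Stripped of the scraping detail, the control flow is roughly the skeleton below, where fetch_page and parse_rows are placeholders for the fetchURL and BeautifulSoup work done per page.

def paged_search(fetch_page, parse_rows, max_pages=0):
    """Skeleton of the paging loop used by WWT above.

    fetch_page(page) and parse_rows(html) are placeholders for the real
    scraping; parse_rows is assumed to yield only rows that pass the filters.
    """
    page = 0
    results = []
    next_page = True
    while next_page:
        next_page = False
        for row in parse_rows(fetch_page(page)):
            results.append(row)
            next_page = True              # keep paging while rows keep coming
        page += 1
        if 0 < max_pages < page:          # max_pages == 0 means "no limit"
            next_page = False
    return results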
Ejemplo n.º 43
0
def view():		
	addon_handle = int(sys.argv[1])
	addon       = xbmcaddon.Addon()
	addonname   = addon.getAddonInfo('name')
	
	args = urlparse.parse_qs(sys.argv[2][1:])

	xbmcplugin.setContent(addon_handle, 'movies')

	cat=args.get('cat', None)
	page = args.get('page', None)
	link = args.get('link', None)	
	
	catalogues=[{'label':'\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'),'id':'video/new/'},
				{'label':'Video Hot','id':'video/hot/'}]
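	# (the hex-escaped label above decodes to the Vietnamese "Video Mới", i.e. "New Videos")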
	#play link
	if link!=None:
		link_video=link[0]
		if link_video.startswith(web_url):
			r = requests.get(link[0])
			html = r.text
			#xbmc.log(html.encode('utf-8'))
			soup = BeautifulSoup(html)
			video_src=soup.find('embed', attrs={'id':'zplayer'})
			video_flashvars=video_src.get('flashvars')
			args_video = urlparse.parse_qs(video_flashvars)
			link_video=args_video['file'][0]					
		xbmc.Player().play(link_video)
		return
	#Load cats
	if cat==None:
		for cat in catalogues:
			li = xbmcgui.ListItem(cat['label'])
			urlList = CMDTools.build_url(base_url,{'web':get_Web_Name(), 'cat':cat['id']})
			xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)	
		xbmc.executebuiltin('Container.SetViewMode(501)')		 			
		xbmcplugin.endOfDirectory(addon_handle)
		return
	#Load the selected category's contents
	if cat!=None:
		if page==None:
			page=1
		else:
			page=int(page[0])
		r = requests.get(web_url+cat[0]+str(page))
		html = r.text		
		xbmc.log(html.encode('utf-8'))
		soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)			
		data_List=soup.findAll('a',attrs={'class':'play'})
		#load item menu
		for item in data_List:			
			link_item=web_url+item.get('href')			
			if item.get('data-youtubeid')!='':
				link_item="plugin://plugin.video.youtube/play/?video_id="+item.get('data-youtubeid')
			img_item=item.find('img')
			img_src=img_item.get('src')
			img_alt=img_item.get('alt')
			
			li = xbmcgui.ListItem(img_alt)
			
			li.setThumbnailImage(img_src)
			li.setInfo(type='image',infoLabels="")					
			
			urlList = CMDTools.build_url(base_url,{'web':get_Web_Name(), 'link':link_item, 'type':cat[0]})
			xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)			
		
		#Create the "Next" button
		li = xbmcgui.ListItem("Next")	
		urlList=CMDTools.build_url(base_url,{'web':web_name, 'cat':cat[0],'page': page+1});
		xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)	
		
		xbmc.executebuiltin('Container.SetViewMode(501)')
		#xbmc.executebuiltin("ClearSlideshow")		
		#xbmc.executebuiltin("SlideShow(,,notrandom)")		
		xbmcplugin.endOfDirectory(addon_handle)
		return
					
	xbmcplugin.endOfDirectory(addon_handle)
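Kodi passes the plugin handle in sys.argv[1] and the query string in sys.argv[2], so the three branches above (play a link, list the categories, list one category page) are chosen purely by which keys parse_qs finds. A rough illustration with made-up values:

import urlparse                      # Python 2; use urllib.parse on Python 3

# e.g. sys.argv == ['plugin://plugin.video.example/', '5', '?cat=video/hot/&page=2']
args = urlparse.parse_qs('cat=video/hot/&page=2')
print(args)                          # {'cat': ['video/hot/'], 'page': ['2']}
# 'cat' present and 'link' absent -> the "load category contents" branch runs,
# fetching page 2 of the 'video/hot/' listing.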
Ejemplo n.º 44
0
def TDL(book=None):

    provider = "torrentdownloads"
    host = lazylibrarian.TDL_HOST
    if not str(host)[:4] == "http":
        host = 'http://' + host

    providerurl = url_fix(host)

    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)

    try:
        request = urllib2.Request(searchURL)
        if lazylibrarian.PROXY_HOST:
            request.set_proxy(lazylibrarian.PROXY_HOST,
                              lazylibrarian.PROXY_TYPE)
        request.add_header('User-Agent', USER_AGENT)
        data = urllib2.urlopen(request, timeout=90)
    except (socket.timeout) as e:
        logger.debug('Timeout fetching data from %s' % provider)
        data = False
    except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e:
        # may return 404 if no results, not really an error
        if hasattr(e, 'code') and e.code == 404:
            logger.debug(searchURL)
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            if hasattr(e, 'reason'):
                errmsg = e.reason
            else:
                errmsg = str(e)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, errmsg))
        data = False

    results = []

    minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1
    if data:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None

                    if link and minimumseeders < seeders:
                        # no point requesting the magnet link if not enough seeders
                        request = urllib2.Request(link)
                        if lazylibrarian.PROXY_HOST:
                            request.set_proxy(lazylibrarian.PROXY_HOST,
                                              lazylibrarian.PROXY_TYPE)
                        request.add_header('User-Agent', USER_AGENT)

                        conn = urllib2.urlopen(request, timeout=90)
                        result = conn.read()
                        url = None
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                url = output
                                break

                    if minimumseeders < int(seeders):
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))

                except Exception as e:
                    logger.error(u"An error occurred in the %s parser: %s" %
                                 (provider, str(e)))

    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
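This older TDL variant builds its proxy-aware urllib2 request inline in two places; the same pattern can be pulled into one small helper. A sketch, assuming the lazylibrarian.PROXY_HOST / PROXY_TYPE settings, USER_AGENT constant and logger already used above:

import socket
import urllib2

def fetch_with_proxy(url, timeout=90):
    """Fetch url through the configured proxy, returning the body or None.

    Illustrative helper; the function above repeats this request setup inline.
    """
    request = urllib2.Request(url)
    if lazylibrarian.PROXY_HOST:
        request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
    request.add_header('User-Agent', USER_AGENT)
    try:
        return urllib2.urlopen(request, timeout=timeout).read()
    except (socket.timeout, urllib2.URLError) as e:
        logger.debug('Error fetching %s: %s' % (url, str(e)))
        return None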
Ejemplo n.º 45
0
def KAT(book=None):

    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host

    providerurl = url_fix(host + "/usearch/" + book['searchterm'])

    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)

    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False

    results = []

    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)

        try:
            table = soup.findAll('table')[1]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []

        c0 = []
        c1 = []
        c3 = []

        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 3:
                    c0.append(row.findAll('td')[0])
                    c1.append(row.findAll('td')[1])
                    c3.append(row.findAll('td')[3])

        for col0, col1, col3 in zip(c0, c1, c3):
            try:
                title = unaccented(
                    str(col0).split('cellMainLink">')[1].split('<')[0])
                # kat can return magnet or torrent or both.
                magnet = ''
                url = ''
                mode = 'torrent'
                try:
                    magnet = 'magnet' + str(col0).split(
                        'href="magnet')[1].split('"')[0]
                    mode = 'magnet'
                except IndexError:
                    pass
                try:
                    url = 'http' + str(col0).split('href="http')[1].split(
                        '.torrent?')[0] + '.torrent'
                    mode = 'torrent'
                except IndexError:
                    pass

                if not url or (magnet and url
                               and lazylibrarian.CONFIG['PREFER_MAGNET']):
                    url = magnet
                    mode = 'magnet'

                try:
                    size = str(col1.text).replace('&nbsp;', '').upper()
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col3.text)
                except ValueError:
                    seeders = 0

                if not url or not title:
                    logger.debug('Missing url or title')
                elif minimumseeders < seeders:
                    results.append({
                        'bookid': book['bookid'],
                        'tor_prov': provider,
                        'tor_title': title,
                        'tor_url': url,
                        'tor_size': str(size),
                        'tor_type': mode
                    })
                    logger.debug('Found %s. Size: %s' % (title, size))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))

    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
Ejemplo n.º 46
0
def TPB(book=None):

    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?q=" + book['searchterm'])

    params = {"category": "601", "page": "0", "orderby": "99"}
    searchURL = providerurl + "&%s" % urllib.urlencode(params)

    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False

    results = []

    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[0]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []

        c1 = []
        c2 = []

        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 2:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])

        for col1, col2 in zip(c1, c2):
            try:
                title = unaccented(
                    str(col1).split('title=')[1].split('>')[1].split('<')[0])
                magnet = str(col1).split('href="')[1].split('"')[0]
                size = unaccented(col1.text.split(', Size ')[1].split('iB')[0])
                mult = 1
                try:
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col2.text)
                except ValueError:
                    seeders = 0

                if minimumseeders < seeders:
                    # no point in asking for magnet link if not enough seeders
                    magurl = '%s/%s' % (host, magnet)
                    result, success = fetchURL(magurl)
                    if not success:
                        logger.debug('Error fetching url %s, %s' %
                                     (magurl, result))
                    else:
                        magnet = None
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                magnet = output
                                break
                    if not magnet or not title:
                        logger.debug('Missing magnet or title')
                    else:
                        if minimumseeders < seeders:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': magnet,
                                'tor_size': str(size),
                                'tor_type': 'magnet'
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))

    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
Ejemplo n.º 47
0
def GEN(book=None):

    provider = "libgen"
    host = lazylibrarian.CONFIG['GEN_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host

    searchURL = url_fix(
        host +
        "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" +
        book['searchterm'])

    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        elif '111' in result:
            # looks like libgen has ip based access limits
            logger.error(
                'Access forbidden. Please wait a while before trying %s again.'
                % provider)
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False

    results = []

    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[2]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []

        c1 = []
        c2 = []
        c7 = []
        c8 = []

        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 8:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
                    c7.append(row.findAll('td')[7])
                    c8.append(row.findAll('td')[8])

        for col1, col2, col7, col8 in zip(c1, c2, c7, c8):
            try:
                author = unaccented(col1.text)
                title = unaccented(
                    str(col2).split('>')[2].split('<')[0].strip())
                link = str(col2).split('href="')[1].split('?')[1].split('"')[0]
                size = unaccented(col7.text).upper()
                extn = col8.text

                try:
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0

                if link and title:
                    if author:
                        title = author.strip() + ' ' + title.strip()
                    if extn:
                        title = title + '.' + extn

                    bookURL = url_fix(host + "/ads.php?" + link)
                    bookresult, success = fetchURL(bookURL)
                    if not success:
                        # may return 404 if no results, not really an error
                        if '404' in bookresult:
                            logger.debug(u"No results found from %s for %s" %
                                         (provider, book['searchterm']))
                        else:
                            logger.debug(bookURL)
                            logger.debug('Error fetching data from %s: %s' %
                                         (provider, bookresult))
                        bookresult = False
                    if bookresult:
                        url = None
                        new_soup = BeautifulSoup(bookresult)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('/get.php'):
                                url = output
                                break

                        if url:
                            url = url_fix(host + url)
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct'
                            })
                            logger.debug('Found %s, Size %s' % (title, size))

            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))

    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results