def parse_summary(self, summary, link): """处理文章""" soup = BeautifulSoup(summary) for span in list(soup.findAll(attrs={ "style" : "display: none;" })): span.extract() for attr in self.remove_attributes: for x in soup.findAll(attrs={attr:True}): del x[attr] for tag in soup.findAll(self.remove_tags): tag.extract() img_count = 0 for img in list(soup.findAll('img')): if (self.max_image_number >= 0 and img_count >= self.max_image_number) \ or img.has_key('src') is False \ or img['src'].startswith("http://union.vancl.com/") \ or img['src'].startswith("http://www1.feedsky.com/") \ or img['src'].startswith("http://feed.feedsky.com/~flare/"): img.extract() else: try: localimage = self.down_image(img['src'], link) if localimage: img['src'] = localimage img_count = img_count + 1 else: img.extract() except Exception, e: print e img.extract()
def parse_summary(self, summary, link): """处理文章""" soup = BeautifulSoup(summary) for span in list(soup.findAll(attrs={"style": "display: none;"})): span.extract() for attr in self.remove_attributes: for x in soup.findAll(attrs={attr: True}): del x[attr] for tag in soup.findAll(self.remove_tags): tag.extract() img_count = 0 for img in list(soup.findAll('img')): if (self.max_image_number >= 0 and img_count >= self.max_image_number) \ or img.has_key('src') is False \ or img['src'].startswith("http://union.vancl.com/") \ or img['src'].startswith("http://www1.feedsky.com/") \ or img['src'].startswith("http://feed.feedsky.com/~flare/"): img.extract() else: try: localimage = self.down_image(img['src'], link) if localimage: img['src'] = localimage img_count = img_count + 1 else: img.extract() except Exception, e: print e img.extract()
def getPresentation(self): base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_' for i in range(1, 35): #取出34[1-34]个省份的未来两天的招聘信息 url = base_url + str(i) + '.html' #print url try: page = self.getRes(url) soup = BeautifulSoup(page) except: #url打开失败 continue #取出所有的倒计时 try: #当前城市可能未来一段时间没有宣讲会信息 countdowns = soup.findAll('div', {'class': 'list_topic'}) y_m_d2, y_m_d3 = '', ''; #记录第二天和第三天的宣讲会日期 first, second = -1, -1 #第二天和第三天的宣讲会出现的名字为campusTalk的table下标.其位置是和倒计时出现的div保持错开一个位置 # 因为第0个名为campusTalk的table是表格标题栏,从第1个开始才是宣讲会的信息,因此day初始化为1 day = 1 for countdown in countdowns: cd = string.atoi(countdown.contents[0].contents[2].string) if cd > 2: #倒计时超过2天的宣讲会,暂不考虑 break elif cd == 1: #第二天要举行的宣讲会【倒计时剩1天】 first = day y_m_d2 = countdown.contents[1].string elif cd == 2: #第三天要举行的宣讲会【倒计时剩2天】 second = day y_m_d3 = countdown.contents[1].string day = day + 1 # first是第2天信息,second是第三天的信息,假如为-1,表示那天没有宣讲会 if first != -1: tables = soup.findAll('table', {'class':'campusTalk'})[first] trs = tables.findAll('tr') for tr in trs: tds = tr.findAll('td') city = tds[0].a.string.strip() school = tds[1].a.string.strip() addr = tds[2].string.strip() inc = tds[3].a.string.strip() try: # 有些宣讲会未标出具体开始时间[H-M-S] pdate = y_m_d2 + ' ' + tds[4].string except Exception, e: pdate = y_m_d2 #那么只记录年月日即可 self.presentations.append(CPresentation(city, inc, school, pdate, addr)) if second != -1: tables = soup.findAll('table', {'class':'campusTalk'})[second] trs = tables.findAll('tr') for tr in trs: tds = tr.findAll('td') city = tds[0].a.string.strip() school = tds[1].a.string.strip() addr = tds[2].string.strip() inc = tds[3].a.string.strip() try: pdate = y_m_d3 + ' ' + tds[4].string except: pdate = y_m_d3 self.presentations.append(CPresentation(city, inc, school, pdate, addr)) except:
def fetchSong(url, viewCount): try: #Get song info from url songInfo = {} _get = url.split('?')[1] tokens = _get.split('&') for token in tokens: toks = token.split('=') songInfo[toks[0]] = int(toks[1]) #fetch the html lyricsWeb = urllib2.urlopen(url) webContent = lyricsWeb.read() lyricsWeb.close() soup = BeautifulSoup(webContent) lyrics = soup.findAll(id="mylrc")[0].contents author = soup.findAll(attrs={'class' : 'link_hb'})[0].contents[0] album = soup.findAll(attrs={'class' : 'link_hb'})[1].contents[0] title = soup.findAll(attrs={'class' : 'link_hb'})[2].contents[0] #print lyrics lyricsText = '' for line in lyrics: for t in line: lyricsText += t #Construct the xml root = ET.Element("xml") doc = ET.SubElement(root, "doc") sidNode = ET.SubElement(doc, "sid") sidNode.text = str(songInfo[u'sid']) aidNode = ET.SubElement(doc, "aid") aidNode.text = str(songInfo[u'aid']) lidNode = ET.SubElement(doc, "lid") lidNode.text = str(songInfo[u'lid']) titleNode = ET.SubElement(doc, "title") titleNode.text = title authorNode = ET.SubElement(doc, "author") authorNode.text = author viewCountNode = ET.SubElement(doc, "viewCount") viewCountNode.text = str(viewCount) lyricsNode = ET.SubElement(doc, "lyrics") lyricsNode.text = lyricsText #Construct the tree tree = ET.ElementTree(root) filename = lyricsDbPath + str(songInfo['lid']) + ".txt" tree.write(filename, "utf-8") except: pass
def getResURL(self, url):
    # Fetch the product page (GBK-encoded) and re-encode it as UTF-8.
    page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
    soup = BeautifulSoup(page)
    try:
        if self.is_book:
            # DOM layout #1: book / film / music pages.
            search_div = soup.findAll('div', {'name': '__link_sale'})[0]
        elif not self.is_book:
            # DOM layout #2: general-merchandise pages.
            search_div = soup.findAll('div', {'class': 'goumai_anniu'})[0]
        else:
            # NOTE(review): unreachable -- the if/elif above already covers
            # both boolean cases of self.is_book.
            search_div = "NULL"
    except Exception, e:
        # Log message reads "failed to fetch product info".
        self.py_log.log("获取商品信息失败", self.py_log.get_file_name(), self.py_log.get_line())
        return ""
    # NOTE(review): search_div is computed but never returned; the success
    # path falls off the end returning None -- possibly truncated source,
    # confirm against the original file.
def readability(self, url, decoder):
    """Extract the main article from *url* with readability-lxml.

    Written as a generator to avoid holding all image data in memory:
    yields (mime, url, filename, content) for each image, then a final
    (title, None, None, html) tuple.
    """
    opener = URLOpener(self.host)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        logging.error('err(%d) to fetch %s.' % (status_code, url))
        return
    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content)
    content = self.preprocess(content)
    # Extract the main body and title.
    doc = Document(content)
    summary = doc.summary(html_partial=True)
    title = doc.short_title()
    title = self.processtitle(title)
    html = self.FragToXhtml(summary, title, addtitleinbody=True)
    # Only the article body remains, so BeautifulSoup is cheap here.
    if self.keep_image:
        soup = BeautifulSoup(html)
        for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
            # BUGFIX: was `cmt.extract` without parentheses -- the method
            # was never called, so comment nodes were never removed.
            cmt.extract()
        self.soupbeforeimage(soup)
        for img in soup.findAll('img'):
            imgurl = img['src']
            if not imgurl.startswith('http') and not imgurl.startswith('www'):
                imgurl = self.urljoin(url, imgurl)
            imgresult = opener.open(imgurl)
            imgcontent = imgresult.content if imgresult.status_code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    if imgtype == 'jpeg':
                        fnimg = "%d.jpg" % random.randint(10000, 99999999)
                    else:
                        fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent)
        # NOTE(review): the renamed img srcs are not rendered back into
        # `html`; the final yield uses the pre-rewrite markup -- confirm.
    yield (title, None, None, html)
def getRes(self):
    """Scrape the low-price flight list page and print each itinerary."""
    url = self.getResURL()
    page = urllib2.urlopen(url).read()#.decode('GBK').encode('utf-8')
    soup = BeautifulSoup(page)
    # Walk down the page structure: main_wrapper > clr_after > main >
    # lowpriceList > hdivResultTable contains the result rows.
    main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
    #print main_wrapper.prettify()
    clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
    #print clr_after.prettify()
    items = clr_after.findAll('div', {'class': 'main'})[0]
    #print items.prettify()
    items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
    print items1.prettify().decode('utf-8').encode('gbk')
    items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
    #print items2.prettify().decode('utf-8').encode('gbk')
    for item in items2:
        print item
        # Per-row fields: airline (col3), flight duration (col4),
        # departure time (col2), plus discount and price spans.
        inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
        fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
        _time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
        _discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
        _price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
        print inc#.decode('utf-8').encode('gbk')
        print fly_time#.decode('utf-8').encode('gbk')
        print _time#.decode('utf-8').encode('gbk')
        print _discount.decode('utf-8').encode('gbk')
        print _price.decode('utf-8').encode('gbk')
def render(self):
    """Return the page at self.content_url as HTML with absolute links.

    Pages are cached by URL; the BODY contents are returned unless
    self.element_id selects a specific element.
    """
    content = cache.get(self.content_url)
    # If the page is not cached, retrieve it. Identity test (`is None`)
    # is the correct check for the cache-miss sentinel, not `== None`.
    if content is None:
        opener = urllib2.build_opener()
        content = opener.open(self.content_url, timeout=5).read()
        # Save the page in cache
        cache.set(self.content_url, content)
    soup = BeautifulSoup(content)
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(self.content_url, tag['href'])
    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))
    return html
def get_episodes(): """docstring for get_episodes""" html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania") soup = BeautifulSoup(html, fromEncoding="utf-8") print soup.originalEncoding episodesHtml = soup.findAll("div", attrs={"class" : "video-img-cont-catchup cont-first"}) """ result <div class="video-img-cont-catchup cont-first" id="5217"> <div class="video-date">okt 24.<span>12:15</span></div> <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img"> <img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" /> </a> <a href="javascript:void(0)" class="video-add" id="5217-0"> <img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" /> </a> <div class="img-height-wide"></div> <h2> <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a> </h2> <p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p> </div> """ episodes = [] #print len(episodesHtml) for episode in episodesHtml: episodes.append({"title":episode.h2.a.string, "url":episode.h2.a['href'], "thumb":episode.a.img['src']}) #print episodes return episodes
def view_page(slug):
    """Render a CMS page looked up by slug, prettifying <pre> code blocks."""
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    # BUGFIX: use unicode() as view_post does -- str() raises
    # UnicodeEncodeError on non-ASCII page content under Python 2.
    page.content = unicode(content)
    return render_template('cms_view_page.html', page=page)
def get_genres(self, url):
    """Return the available genres from the homepage."""
    html = download_page(url)
    # Parse only the <ul class="menu"> block; each anchor is one genre.
    menu = BS(html, parseOnlyThese=SS('ul', {'class': 'menu'}))
    entries = []
    for anchor in menu.findAll('a'):
        entries.append({'name': anchor.span.string,
                        'url': urljoin(self.base_url, anchor['href'] + '&limit=0'),
                        'mode': '1'})
    self.add_dirs(entries)
def sanitize_contents(self, contents):
    """Strip scripting vectors from *contents*: script/meta/head/link tags,
    on* event-handler attributes, and javascript: href/src values.
    Returns the sanitized markup."""
    soup = BeautifulSoup(contents)
    # Remove whole tags that can carry executable or external content.
    for tagname in ['script', 'meta', 'head', 'link']:
        for tag in soup.findAll(tagname):
            tag.extract()
    attr_re = re.compile('^on.*', re.I)
    for tag in soup.findAll():
        # BUGFIX: snapshot the attrs list -- deleting attributes while
        # iterating the live list can skip entries (and leave handlers in).
        for attr, _ in list(tag.attrs):
            if attr_re.match(attr):
                del tag[attr]
    # Drop javascript: URLs in href/src attributes.
    for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['href']
    for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['src']
    sanitized_contents = soup.renderContents()
    return sanitized_contents
def location(self, ip):
    """Return a geographic location string for *ip* via 114best.com;
    falls back to "Earth" when the lookup page cannot be opened."""
    try:
        self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
    except Exception:
        return "Earth"
    soup = BeautifulSoup(self.current_page)
    # The answer sits in the second <b> of the div#output element.
    output_div = soup.findAll('div', {"id": "output"})[0]
    return output_div.findAll('b')[1].text.encode('utf-8', 'ignore')
def assert_no_error_message_in_response(self, response):
    """Check that response has no error messages.

    Looks for both <p class="alert-error"> and <label class="alert-error">.
    """
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    # BUGFIX: was findAll(), which returns a list -- `el.contents` then
    # raised AttributeError exactly when an error label was present.
    el = soup.find("label", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
def Items(self):
    """Generator over all configured feeds.

    Yields (mime, url, filename, content) tuples for images and
    (section, link, title, html) tuples for articles; duplicate titles
    are emitted only once.
    """
    itemsprocessed = []
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        content = None
        cnt4debug += 1
        # Local debug runs only fetch the first feed.
        if IsRunInLocal and cnt4debug > 1:
            break
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        # BUGFIX: was `status_code != 200 and content`, which skipped the
        # error path precisely when a failed fetch returned an empty body
        # (readability() uses the same `or not content` check).
        if status_code != 200 or not content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue
        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)
        content = self.preprocess(content)
        feed = feedparser.parse(content)
        for e in feed['entries']:
            # Full-text RSS: ads and other unwanted content can be
            # stripped in postprocess.
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
            if self.keep_image:
                # NOTE(review): this soups the whole feed `content`, not the
                # per-entry `desc` built above (which is then overwritten
                # below when keep_image is set) -- confirm intent.
                soup = BeautifulSoup(content)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None
            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)
def view_post(category_slug, post_slug):
    """Render one blog post addressed by category and post slug,
    prettifying any <pre> code blocks before rendering."""
    category = Category.gql("WHERE slug = :1", category_slug)[0]
    recent_first = Post.all().order('-date_created')
    matches = [entry for entry in recent_first
               if entry.category.slug == category_slug and entry.slug == post_slug]
    post = matches[0]
    soup = BeautifulSoup(post.content)
    for block in soup.findAll('pre'):
        block.contents[0].replaceWith(controllers.prettify_code(block.contents[0]))
    post.content = unicode(soup)
    return render_template('cms_view_post.html', post=post)
def parse_summary(self, summary, ref): """处理文章内容,去除多余标签并处理图片地址""" soup = BeautifulSoup(summary) for span in list(soup.findAll(attrs={"style": "display: none;"})): span.extract() for attr in self.remove_attributes: for x in soup.findAll(attrs={attr: True}): del x[attr] for tag in soup.findAll(self.remove_tags): tag.extract() img_count = 0 images = [] for img in list(soup.findAll('img')): if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \ or img.has_key('src') is False : img.extract() else: try: if img['src'].encode('utf-8').lower().endswith( ('jpg', 'jpeg', 'gif', 'png', 'bmp')): localimage, fullname = self.parse_image(img['src']) # 确定结尾为图片后缀,防止下载非图片文件(如用于访问分析的假图片) if os.path.isfile(fullname) is False: images.append({ 'url': img['src'], 'filename': fullname, 'referer': ref }) if localimage: img['src'] = localimage img_count = img_count + 1 else: img.extract() else: img.extract() except Exception, e: logging.info("error: %s" % e) img.extract()
def parse_summary(self, summary, ref): """处理文章内容,去除多余标签并处理图片地址""" soup = BeautifulSoup(summary) for span in list(soup.findAll(attrs = { "style" : "display: none;" })): span.extract() for attr in self.remove_attributes: for x in soup.findAll(attrs = {attr:True}): del x[attr] for tag in soup.findAll(self.remove_tags): tag.extract() img_count = 0 images = [] for img in list(soup.findAll('img')): if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \ or img.has_key('src') is False : img.extract() else: try: if img['src'].encode('utf-8').lower().endswith(('jpg', 'jpeg', 'gif', 'png', 'bmp')): localimage, fullname = self.parse_image(img['src']) # 确定结尾为图片后缀,防止下载非图片文件(如用于访问分析的假图片) if os.path.isfile(fullname) is False: images.append({ 'url':img['src'], 'filename':fullname, 'referer':ref }) if localimage: img['src'] = localimage img_count = img_count + 1 else: img.extract() else: img.extract() except Exception, e: logging.info("error: %s" % e) img.extract()
def get_refresh_url(page_content):
    """Return the target URL of a <meta http-equiv="refresh"> tag in
    *page_content*, or None if there is none (or parsing fails)."""
    try:
        page_soup = BeautifulSoup(page_content)
        for meta_tag in page_soup.findAll('meta'):
            # BUGFIX: meta_tag['http-equiv'] raised KeyError for any meta
            # tag without that attribute, and the bare except below then
            # aborted the whole scan; .get() skips such tags instead.
            if meta_tag.get('http-equiv', '').lower() == 'refresh':
                refresh_url = meta_tag['content'].split('URL=')[1]
                return refresh_url
    except Exception:
        pass
    return None
def assert_warning_message_in_response(self, response, message=""):
    """Check if response contains one or more warning messages.

    Assume warning messages rendered as <p class="alert-warning"> elements.
    """
    soup = BeautifulSoup(response)
    alert = soup.findAll("p", "alert-warning")
    self.assertGreater(len(alert), 0, "no warning message found in response")
    if message:
        # BUGFIX: str.find() returns 0 when the message starts at index 0,
        # so the old assertGreater(found, 0) wrongly failed in that case.
        self.assertIn(message, str(alert[0]))
def parse_page(writer, catalogue, page=1): print 'Parsing page %s' % page url = urllib.urlopen(URL % (catalogue, page)) soup = BeautifulSoup(url) table = soup.find('table', attrs={'class': 'snippets'}) for tr in table.findAll('tr'): # get name of the page name = tr.td.h4.a.string # get URL of the page url = tr.td.h4.a['href'].encode('utf-8') #get stats info stats = '?' stats_element = tr.find('p', attrs={'class': 'Stats'}) if stats_element: stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '') if stats == 'wtrakc': stats = '?' # get price price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12] # calculate CPM cpm = '?' try: cpm = (float(price)*30) / int(stats) * 1000 except: cpm = '?' # write to the file row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')] print row writer.writerow(row) # find last page of the catalogue anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+')) if not anchors: return pages = [] for anchor in anchors: number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1) pages.append(int(number)) pages.sort() last = pages[-1] # parse next page if exists if last > page: next = page + 1 parse_page(writer, catalogue, next)
def assert_has_div_with_ID(self, response, id_attr):
    """Check if response contains a Div with a particular ID attribute.

    <div id="<some-id>"> elements.
    """
    soup = BeautifulSoup(response)
    alert = soup.findAll("div", id=id_attr)
    # The old `assertGreater(len(alert), 0)` inside `if alert:` could never
    # fail; a single guard expresses the intent directly.
    if not alert:
        self.fail("No Div tag with, id=%s, in response" % str(id_attr))
def strip_professors(html, name):
    """Returns list of professor matches"""
    profs = []
    table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
    if table is None:
        # NOTE(review): html[500:] logs everything AFTER the first 500
        # characters; html[:500] may have been intended -- confirm.
        logging.debug(html[500:])
        return profs
    # Query arrives as "LAST,FIRST" (last char dropped, upper-cased);
    # the first name is optional.
    split = name[:-1].upper().split(',')
    qLast = split[0]
    try:
        qFirst = split[1]
    except:
        qFirst = ''
    rows = table.findAll('div', {'class': re.compile(r"entry (odd|even)")})
    for row in rows:
        divName = row.find('div', {'class': 'profName'})
        anchor = divName.find('a')
        profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
        try:
            firstName = profName.split(',')[1]
        except:
            firstName = ''
        # logging.info('Searching against: ' + profName)
        # Last name must match as a prefix; the queried first-name fragment
        # only needs to appear somewhere in the candidate's first name.
        if profName.startswith(qLast) and qFirst in firstName:
            href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
            profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
            profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
            profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
            profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
            profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()
            if profHot == 'Hot':
                profHot = '✓'
            else:
                profHot = ' '
            profs.append({
                'name': profName,
                'href': href,
                'dept': profDept,
                'ratings': profRatings,
                'quality': profQuality,
                'easiness': profEasiness,
                'hot': profHot
            })
    return profs
def league_settings(league_id, access_code):
    """Return a position -> slot-count mapping scraped from a Yahoo
    fantasy football league's settings page."""
    response = urlfetch.fetch("http://football.fantasysports.yahoo.com/f1/%s/settings?pak=%s" % (league_id, access_code))
    settings_table_soup = BeautifulSoup(response.content).find("table", attrs={'id': 'settings-table'})
    positions = defaultdict(int)
    # NOTE(review): row 23 of the settings table is assumed to hold the
    # comma-separated roster positions in a <td width="410"><b> cell --
    # fragile against any Yahoo layout change; confirm still valid.
    for p in [str(s.strip()) for s in settings_table_soup.findAll('tr')[23].find('td', attrs={'width': '410'}).b.contents[0].split(',')]:
        positions[p] += 1
    #bench_spots = roster_positions.count('BN')
    return positions
def strip_search(html):
    """Extract the WebSOC search form and repoint it at our own handler."""
    form_html = BeautifulSoup(html).find('form', action='http://websoc.reg.uci.edu/')
    #replace form submit with our own link
    form_html['action'] = '/schedules'
    # Swap every 'Display Text Results' button for our own submit markup.
    replacement = '<p id=\"submit-container\"><input type="submit" value="Display Results" name="Submit"></p>'
    for button in form_html.findAll(attrs={"class": "banner-width"}):
        button.replaceWith(replacement)
    return str(form_html)
def get_playlist(path):
    """Fetches the playlist for a DJ set"""
    # The playlist is an XSPF-style XML document with one <track> per song.
    playlist_xml = retrieve_url(BASE_URL + get_asset_path(path) + '.xml')
    soup = BeautifulSoup(playlist_xml)
    return [{"title": unescape(track.title.string),
             "artist": unescape(track.creator.string),
             "file": track.location.string,
             "start": track.meta.nextSibling}
            for track in soup.findAll('track')]
def get_sets():
    """Retrieves the most popular sets from Mugasha"""
    # usock = open(BASE_CURRENT_SOURCE_PATH, "r")
    html = retrieve_url(BASE_URL + "/browse/sets")
    soup = BeautifulSoup(html)
    # Each set is a div holding three anchors: [0] unused, [1] title/link,
    # [2] thumbnail image.
    containers = soup.findAll('div', attrs={"class": "setTabs set-sel-weekly"})
    collected = []
    for container in containers:
        anchors = container.findAll('a')
        collected.append({"title": anchors[1].string,
                          "thumb_url": anchors[2].img['src'],
                          "browse_path": anchors[1]['href']})
    return collected
def parse_matchup_info(league, team_id):
    """Scrape one team's scoreboard entry and return
    {'score': [...], 'team_names': [...]}, or None when unavailable."""
    logging.info("team_id: %d" % team_id)
    generic_matchup_url = build_url(league_id=league.id, page='matchup',
                                    params={'mid1': team_id}, access_code=league.access_code)
    try:
        matchup_soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content).find('div', attrs={'class': 'scoreboard'}).find('li')
    except:
        matchup_soup = None
    if matchup_soup:
        # BUGFIX: logging moved inside the guard -- prettify() was called
        # unconditionally, raising AttributeError whenever the fetch/parse
        # above failed and matchup_soup was None.
        logging.info("\n\n\n%s\n\n\n" % matchup_soup.prettify())
        team_names = [str(row.find('a').contents[0]).strip() for row in matchup_soup.findAll('tr')]
        score = [float(pts.contents[0]) for pts in matchup_soup.findAll('td', attrs={'class': 'pts'})]
    else:
        team_names = None
        score = None
    if team_names and score:
        return {'score': score, 'team_names': team_names}
    else:
        return None
def strip_professors(html, name):
    """Return a JSON list of RateMyProfessors entries matching *name*.

    *name* is expected as "Last,First"; matching is exact on the last
    name and on the first initial ('!' is a never-matching sentinel for
    a missing first name).
    """
    table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
    if table is None:
        # NOTE(review): html[500:] logs everything after the first 500
        # chars; html[:500] may have been intended -- confirm.
        logging.debug(html[500:])
        return get_rmp_error('Parse Error','Could not find "ratingTable" at RateMyProfessors.com')
    profs = list()
    #name = name.upper()
    split = name.split(',')
    qLastName = split[0].strip()
    # BUGFIX: a query without a comma used to raise IndexError here; fall
    # back to the same '!' sentinel used for an empty first name.
    qFirstName = split[1].strip() if len(split) > 1 else ''
    if (qFirstName == None or qFirstName == ''):
        qFirstName = '!'
    rows = table.findAll('div', {'class': re.compile(r".*\bentry\b.*")})
    for row in rows:
        divName = row.find('div', {'class': 'profName'})
        anchor = divName.find('a')
        profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
        split = profName.split(',')
        lastName = split[0].strip()
        # Same guard for scraped names without a comma.
        firstName = split[1].strip() if len(split) > 1 else ''
        if (firstName == None or firstName == ''):
            firstName = '!'
        #logging.debug(qLastName + ' =? ' + lastName + ' && ' + qFirstName + ' =? ' + firstName)
        if lastName == qLastName and firstName[0] == qFirstName[0]:
            href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
            profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
            profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
            profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
            profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
            profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()
            if profHot == 'Hot':
                profHot = '✓'
            else:
                profHot = ' '
            prof = {
                'name': profName,
                'href': href,
                'dept': profDept,
                'ratings': profRatings,
                'quality': profQuality,
                'easiness': profEasiness,
                'hot': profHot
            }
            #logging.debug(prof)
            profs.append(prof)
    return json.dumps(profs)
def assert_error_message_in_response(self, response, message=""):
    """Check if response contains one or more error messages.

    Assume error messages rendered as <p class="alert-error"> elements.
    """
    soup = BeautifulSoup(response)
    # logging.info(soup)
    alert = soup.findAll("p", "alert-error")
    logging.info(alert)
    # Simplified from the old `if len(alert) > 0: pass / else: fail`.
    if not alert:
        self.fail("no error message found in response")
    if message:
        # BUGFIX: str.find() returns 0 when the message starts at index 0,
        # so the old assertGreater(found, 0) wrongly failed in that case.
        self.assertIn(message, str(alert[0]))
def get(self):
    """Fetch yesterday's podcast RSS, strip <copyright> elements, echo the
    feed and store it as a DailyFeedSnapshot."""
    self.response.headers['Content-Type'] = 'text/plain'
    day = date.today() - relativedelta(days=1)
    response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml")
    if response and response.status_code == 200:
        feed_soup = BeautifulSoup(response.content)
        # Plain loop instead of a throwaway list comprehension used only
        # for its side effects.
        for copyright_el in feed_soup.findAll("copyright"):
            copyright_el.extract()
        self.response.out.write("%s\n\n\n" % feed_soup.prettify())
        DailyFeedSnapshot.create(day, feed_soup.prettify())
        msg = "Created a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.info(msg)
    else:
        msg = "Could not create a DailyFeedSnapshot for %s." % (day)
        self.response.out.write(msg)
        logging.error(msg)
def clawdata(data): data = urllib.urlencode(data) url = "http://www.powerball.com/powerball/pb_nbr_history.asp" response = urllib2.urlopen(url, data) soup = BeautifulSoup(response) for tag in soup.findAll(valign="middle"): csoup = BeautifulSoup(str(tag)) dictIssue = dict() dictIssue["issueDate"] = "" dictIssue["luckNum"] = []; if csoup.tr != None: for tag in csoup.tr.findAll('td'): if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)): dictIssue["issueDate"] = str(tag.text) elif str(tag.text) != " ": dictIssue["luckNum"].append(int(tag.text)) print dictIssue
def clawdata(data): data = urllib.urlencode(data) url = "http://www.powerball.com/powerball/pb_nbr_history.asp" response = urllib2.urlopen(url, data) soup = BeautifulSoup(response) for tag in soup.findAll(valign="middle"): csoup = BeautifulSoup(str(tag)) dictIssue = dict() dictIssue["issueDate"] = "" dictIssue["luckNum"] = [] if csoup.tr != None: for tag in csoup.tr.findAll('td'): if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)): dictIssue["issueDate"] = str(tag.text) elif str(tag.text) != " ": dictIssue["luckNum"].append(int(tag.text)) print dictIssue
def parse_organic_contents(raw_content, organic_pos):
    """Extract position, title, display/target URLs and snippet text from
    one organic search result blob."""
    result = {}
    result['position'] = organic_pos
    soup = BeautifulSoup(raw_content)
    # Title is the first anchor, with markup stripped by the `p` regex.
    title_anchor = soup.find('a')
    result['title'] = p.sub('', str(title_anchor))
    result['display_url'] = parse_display_url(str(raw_content))
    # Target URL comes from the first anchor that carries an href.
    href_anchor = soup.find('a', href=True)
    result['url'] = ul.unquote(str(href_anchor['href']))
    # Snippet text lives in <div class="s"> blocks.
    snippet = p.sub('', str(soup.findAll('div', {'class': 's'})))
    result['text'] = snippet.replace(']', '').replace('[', '')
    return result
def getViewCount(songTitle):
    """Return the YouTube view count of the top search hit for
    *songTitle*, or 0 when anything goes wrong (best effort)."""
    try:
        base = 'http://gdata.youtube.com/feeds/api/videos?v=2&max-results=1&q='
        web = urllib2.urlopen(base + songTitle)
        content = web.read()
        web.close()
        soup = BeautifulSoup(content)
        stats = soup.findAll('yt:statistics')
        return int(stats[0]['viewcount'])
    except:
        return 0
def TPB(book=None, test=False):
    """Search The Pirate Bay for *book*['searchterm'] and return
    (results, errmsg), where each result dict carries the resolved
    magnet link, size and download priority. With test=True, returns
    only whether the first fetch succeeded."""
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    # NOTE(review): providerurl already ends in "?", and searchURL below
    # appends another "?%s" -- confirm the double "?" is accepted.
    providerurl = url_fix(host + "/s/?")
    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0
    sterm = makeUnicode(book['searchterm'])
    page = 0
    results = []
    # Torrents must have strictly more seeders than this to be kept.
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        # Title, magnet link and human-readable size are all
                        # scraped out of the second cell's raw markup.
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split(
                                '<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace(' ', '')
                        # Convert "K/M/G" suffixed sizes to bytes.
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                # The detail page's first magnet: anchor is
                                # the actual download link.
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                # A kept result means more pages may exist.
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def GEN(book=None, prov=None):
    """Search a Library Genesis mirror for a book and return direct-download results.

    book: dict with at least 'searchterm' and 'bookid'.
    prov: config prefix (defaults to 'GEN') selecting the <prov>_HOST,
          <prov>_SEARCH and <prov>_DLPRIORITY entries in lazylibrarian.CONFIG.

    Returns (results, errmsg): results is a list of dicts (tor_type 'direct'),
    errmsg is the last fetch error message, '' if none.
    """
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    # libgen mirrors search via either index.php or search.php; normalise the
    # configured script name to one of those two, with no leading slash
    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]
    page = 1
    results = []
    next_page = True
    while next_page:
        # the two search scripts take different query parameters
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }
        if page > 1:
            params['page'] = page
        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug(u"No results found from %s for %s" %
                             (provider, book['searchterm']))
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error(
                    'Access forbidden. Please wait a while before trying %s again.'
                    % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if result:
            logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            try:
                soup = BeautifulSoup(result)
                try:
                    table = soup.findAll('table')[2]  # un-named table
                    if table:
                        rows = table.findAll('tr')
                except IndexError:  # no results table in result page
                    rows = []
                # NOTE(review): if table were falsy, no IndexError fires and
                # 'rows' stays undefined; the enclosing try/except would then
                # swallow the NameError below — confirm intended.
                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]  # first row is headers
                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.findAll('td')
                    if 'index.php' in search and len(td) > 3:
                        # index.php layout: author in td[0], title in td[2],
                        # extension/size/link embedded in td[4] markup
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[0].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                BeautifulStoneSoup(
                                    td[2].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            temp = str(td[4])
                            temp = temp.split('onmouseout')[1]
                            extn = temp.split('">')[1].split('(')[0]
                            size = temp.split('">')[1].split('(')[1].split(
                                ')')[0]
                            size = size.upper()
                            link = temp.split('href=')[1].split('"')[1]
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s' %
                                str(e))
                    elif 'search.php' in search and len(td) > 8:
                        # search.php layout: author td[1], title/link td[2],
                        # size td[7], extension td[8]
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[1].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                td[2]).split('>')[2].split('<')[0].strip()
                            title = str(
                                BeautifulStoneSoup(
                                    title,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            link = str(td[2]).split('href="')[1].split(
                                '?')[1].split('"')[0]
                            size = unaccented(td[7].text).upper()
                            extn = td[8].text
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s'
                                % str(e))
                    # convert human readable size (e.g. "1.2 MB") to bytes
                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn
                        # resolve the ads.php (or absolute) link for this entry
                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)
                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    u"No results found from %s for %s" %
                                    (provider, book['searchterm']))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' %
                                    (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False
                        if bookresult:
                            # scan the download page for the real get.php /
                            # download/book link
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith(
                                                'http'
                                        ) and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split(
                                                '/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split(
                                                '/download/book')[1]
                                            break
                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.debug(
                                    'Error parsing bookresult for %s: %s' %
                                    (link, str(e)))
                                url = None
                            if url:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider + '/' + search,
                                    'tor_title': title,
                                    'tor_url': url,
                                    'tor_size': str(size),
                                    'tor_type': 'direct',
                                    'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                                })
                                logger.debug('Found %s, Size %s' % (title, size))
                                next_page = True
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results, errmsg
########################start of main################################### for i in range(startId, endId): url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i) #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51") lyricsWeb = urllib2.urlopen(url) webContent = lyricsWeb.read() lyricsWeb.close() soup = BeautifulSoup(webContent) pages = soup.findAll('a') wantedPages = [] for page in pages: if re.search("&page=", page['href']): #print page['href'] url = page['href'] wantedPages.append(url) if len(wantedPages) > 1: #find those who has more than 20 albums maxPageNum = 1 #Max 1 page for each singer pageNum = 0 maxSongNum = 250 songNum = 0 fetchNum = 0
# -*- coding:utf-8 -*- import re import urllib2 from lib.BeautifulSoup import BeautifulSoup agent="""Sosospider+(+http://help.soso.com/webspider.htm)""" blog_url = 'http://blog.sina.com.cn/s/articlelist_1517582220_0_1.html' spider_handle = urllib2.urlopen(blog_url) blog_content = spider_handle.read() soup = BeautifulSoup(blog_content, fromEncoding='utf-8') item_list = soup.findAll('span', {'class':'atc_title'}) urls = ['http://blog.csdn.net/heiyeshuwu/archive/2010/12/19/6085876.aspx'] #for item in item_list: # urls.append(item.a['href']) for url in urls: request = urllib2.Request(url) request.add_header('User-Agent', agent) handle = urllib2.urlopen(request).read() article_soup = BeautifulSoup(handle, fromEncoding='utf-8') title = article_soup.find('h1',{'class':'title_txt'}) content = article_soup.find('div',{'id':'sina_keyword_ad_area2'}) # tmp = [] # for c in content.contents: # print type(c) # tmp.append(c.__str__('utf-8')) print url print title.contents print title.contents[2].replace('\t', '').replace('\r\n', '')
def KAT(book=None, test=False):
    """Search KickAssTorrents for a book and return torrent/magnet results.

    book: dict with at least 'searchterm' and 'bookid'.
    test: if True, return only the success flag of the search fetch
          (connection-test mode).

    Returns (results, errmsg): results is a list of dicts with tor_type
    'torrent' or 'magnet', errmsg is the last fetch error ('' if none).
    """
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" +
                          urllib.quote(book['searchterm']))
    # sort by seeders, books category only
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
            errmsg = result
        result = False
    if test:
        return success
    results = []
    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        rows = []
        try:
            table = soup.findAll('table')[1]  # un-named table
            if table:
                rows = table.findAll('tr')
        except IndexError:  # no results table in result page
            rows = []
        if len(rows) > 1:
            rows = rows[1:]  # first row is headers
        for row in rows:
            td = row.findAll('td')
            if len(td) > 3:
                try:
                    # td[0]=title/links, td[1]=size, td[3]=seeders
                    title = unaccented(
                        str(td[0]).split('cellMainLink">')[1].split('<')[0])
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(
                            td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split(
                            '.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass
                    # fall back to (or prefer) the magnet link
                    if not url or (magnet and url and
                                   lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'
                    try:
                        # convert human readable size ("1.2 GB") to bytes
                        size = str(td[1].text).replace('&nbsp;', '').upper()
                        mult = 1
                        if 'K' in size:
                            size = size.split('K')[0]
                            mult = 1024
                        elif 'M' in size:
                            size = size.split('M')[0]
                            mult = 1024 * 1024
                        elif 'G' in size:
                            size = size.split('G')[0]
                            mult = 1024 * 1024 * 1024
                        size = int(float(size) * mult)
                    except (ValueError, IndexError):
                        size = 0
                    try:
                        seeders = int(td[3].text)
                    except ValueError:
                        seeders = 0
                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid': book['bookid'],
                            'tor_prov': provider,
                            'tor_title': title,
                            'tor_url': url,
                            'tor_size': str(size),
                            'tor_type': mode,
                            'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def TDL(book=None, test=False):
    """Search the torrentdownloads RSS feed for a book and return magnet results.

    book: dict with at least 'searchterm' and 'bookid'.
    test: if True, return only the success flag of the feed fetch
          (connection-test mode).

    Returns (results, errmsg): results is a list of dicts (tor_type
    'magnet'), errmsg is the last fetch error ('' if none).
    """
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host)
    # cid 2 = books category
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False
    if test:
        return success
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            # first magnet anchor on the detail page
                            new_soup = BeautifulSoup(result)
                            for link in new_soup.findAll('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'magnet',
                                'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def WWT(book=None, test=False):
    """Search WorldWideTorrents for a book and return torrent/magnet results.

    book: dict with at least 'searchterm' and 'bookid'; optional 'library'
          ('AudioBook'/'eBook'/'magazine') selects the category.
    test: if True, return only the success flag of the first fetch
          (connection-test mode).

    Returns (results, errmsg): results is a list of dicts with tor_type
    'torrent' or 'magnet', errmsg is the last fetch error ('' if none).
    """
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/torrents-search.php")
    sterm = makeUnicode(book['searchterm'])
    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52
    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {"search": book['searchterm'], "page": page, "cat": cat}
        searchURL = providerurl + "/?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result)
            try:
                tables = soup.findAll('table')  # un-named table
                table = tables[2]
                if table:
                    rows = table.findAll('tr')
            except IndexError:  # no results table in result page
                rows = []
            # NOTE(review): if table were falsy, 'rows' would be undefined
            # here (no IndexError raised) — confirm intended.
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 3:
                    try:
                        # td[0]=title/links, td[1]=size, td[2]=seeders
                        title = unaccented(
                            str(td[0]).split('title="')[1].split('"')[0])
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(
                                td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass
                        # fall back to (or prefer) the magnet link
                        if not url or (magnet and url and
                                       lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'
                        try:
                            # convert human readable size ("1.2 GB") to bytes
                            size = str(td[1].text).replace('&nbsp;', '').upper()
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < int(seeders):
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': mode,
                                'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def view():
    """XBMC/Kodi add-on entry: play a video link or list categories/items.

    Reads the plugin handle from sys.argv[1] and the query string from
    sys.argv[2].  Three modes depending on the parsed args:
    'link' set -> resolve and play the video; no 'cat' -> list the category
    menu; 'cat' set -> list one page of that category with a Next button.
    """
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    # query-string args; parse_qs returns each value as a list
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')
    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)
    # top-level menu entries (first label is UTF-8 encoded Vietnamese)
    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]
    # play link
    if link != None:
        link_video = link[0]
        if link_video.startswith(web_url):
            # on-site page: extract the real stream URL from the zplayer
            # embed's flashvars
            r = requests.get(link[0])
            html = r.text
            #xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return
    # load categories menu
    if cat == None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return
    # load the category's contents
    if cat != None:
        if page == None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})
        # load item menu
        for item in data_List:
            link_item = web_url + item.get('href')
            # prefer the youtube plugin when a youtube id is present
            if item.get('data-youtubeid') != '':
                link_item = "plugin://plugin.video.youtube/play/?video_id=" + item.get('data-youtubeid')
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'link': link_item, 'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)
        # create the Next button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name, 'cat': cat[0], 'page': page + 1});
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        #xbmc.executebuiltin("ClearSlideshow")
        #xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return
    xbmcplugin.endOfDirectory(addon_handle)
def TDL(book=None):
    """Older torrentdownloads search (urllib2-based) returning magnet results.

    book: dict with at least 'searchterm' and 'bookid'.
    Returns a list of result dicts for entries with enough seeders.
    """
    provider = "torrentdownloads"
    host = lazylibrarian.TDL_HOST
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host)
    # cid 2 = books category
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    try:
        request = urllib2.Request(searchURL)
        if lazylibrarian.PROXY_HOST:
            request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
        request.add_header('User-Agent', USER_AGENT)
        data = urllib2.urlopen(request, timeout=90)
    except (socket.timeout) as e:
        logger.debug('Timeout fetching data from %s' % provider)
        data = False
    except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e:
        # may return 404 if no results, not really an error
        if hasattr(e, 'code') and e.code == 404:
            logger.debug(searchURL)
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            if hasattr(e, 'reason'):
                errmsg = e.reason
            else:
                errmsg = str(e)
            logger.debug('Error fetching data from %s: %s' % (provider, errmsg))
        data = False
    results = []
    minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1
    if data:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < seeders:
                        # no point requesting the magnet link if not enough seeders
                        request = urllib2.Request(link)
                        if lazylibrarian.PROXY_HOST:
                            request.set_proxy(lazylibrarian.PROXY_HOST,
                                              lazylibrarian.PROXY_TYPE)
                        request.add_header('User-Agent', USER_AGENT)
                        # NOTE(review): conn is never closed here
                        conn = urllib2.urlopen(request, timeout=90)
                        result = conn.read()
                        url = None
                        # first magnet anchor on the detail page
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                url = output
                                break
                    # NOTE(review): redundant re-check — seeders was already
                    # tested above before fetching the magnet link
                    if minimumseeders < int(seeders):
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error(u"An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def KAT(book=None):
    """Older KickAssTorrents search returning torrent/magnet results.

    book: dict with at least 'searchterm' and 'bookid'.
    Returns a list of result dicts (tor_type 'torrent' or 'magnet') for
    entries with more than the configured minimum number of seeders.
    """
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" + book['searchterm'])
    # sort by seeders, books category only
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[1]
            rows = table.findAll('tr')
        except Exception:
            # no results = no table in result page
            rows = []
        # gather the cells we need per result row, skipping the header row
        c0 = []  # title / link cell
        c1 = []  # size cell
        c3 = []  # seeders cell
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 3:
                    c0.append(row.findAll('td')[0])
                    c1.append(row.findAll('td')[1])
                    c3.append(row.findAll('td')[3])
        for col0, col1, col3 in zip(c0, c1, c3):
            try:
                title = unaccented(
                    str(col0).split('cellMainLink">')[1].split('<')[0])
                # kat can return magnet or torrent or both.
                magnet = ''
                url = ''
                mode = 'torrent'
                try:
                    magnet = 'magnet' + str(col0).split(
                        'href="magnet')[1].split('"')[0]
                    mode = 'magnet'
                except IndexError:
                    pass
                try:
                    url = 'http' + str(col0).split('href="http')[1].split(
                        '.torrent?')[0] + '.torrent'
                    mode = 'torrent'
                except IndexError:
                    pass
                # fall back to (or prefer) the magnet link
                if not url or (magnet and url and
                               lazylibrarian.CONFIG['PREFER_MAGNET']):
                    url = magnet
                    mode = 'magnet'
                try:
                    # convert human readable size ("1.2 GB") to bytes
                    size = str(col1.text).replace('&nbsp;', '').upper()
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    elif 'G' in size:
                        # BUGFIX: gigabyte sizes previously fell through to
                        # float("1.2GB") -> ValueError -> size reported as 0.
                        # Handle 'G' like the other units (matches the newer
                        # KAT parser in this file).
                        size = size.split('G')[0]
                        mult = 1024 * 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col3.text)
                except ValueError:
                    seeders = 0
                if not url or not title:
                    logger.debug('Missing url or title')
                elif minimumseeders < seeders:
                    results.append({
                        'bookid': book['bookid'],
                        'tor_prov': provider,
                        'tor_title': title,
                        'tor_url': url,
                        'tor_size': str(size),
                        'tor_type': mode
                    })
                    logger.debug('Found %s. Size: %s' % (title, size))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def TPB(book=None):
    """Older Pirate Bay search returning magnet-link results.

    book: dict with at least 'searchterm' and 'bookid'.
    Returns a list of result dicts (tor_type 'magnet') for entries with
    enough seeders.
    """
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?q=" + book['searchterm'])
    # category 601 = ebooks
    params = {"category": "601", "page": "0", "orderby": "99"}
    searchURL = providerurl + "&%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[0]
            rows = table.findAll('tr')
        except Exception:
            # no results = no table in result page
            rows = []
        # gather the cells we need per result row, skipping the header row
        c1 = []  # title / link / size cell
        c2 = []  # seeders cell
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 2:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
        for col1, col2 in zip(c1, c2):
            try:
                title = unaccented(
                    str(col1).split('title=')[1].split('>')[1].split('<')[0])
                magnet = str(col1).split('href="')[1].split('"')[0]
                size = unaccented(col1.text.split(', Size ')[1].split('iB')[0])
                mult = 1
                try:
                    # convert human readable size to bytes.
                    # NOTE(review): no 'G' branch here — a "1.2 G" size falls
                    # through and float() raises, reporting size 0 (the newer
                    # TPB parser in this file handles 'G').
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col2.text)
                except ValueError:
                    seeders = 0
                if minimumseeders < seeders:
                    # no point in asking for magnet link if not enough seeders
                    magurl = '%s/%s' % (host, magnet)
                    result, success = fetchURL(magurl)
                    if not success:
                        logger.debug('Error fetching url %s, %s' %
                                     (magurl, result))
                    else:
                        magnet = None
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                magnet = output
                                break
                        if not magnet or not title:
                            logger.debug('Missing magnet or title')
                        else:
                            # NOTE(review): redundant re-check — seeders was
                            # already tested above
                            if minimumseeders < seeders:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet'
                                })
                                logger.debug('Found %s. Size: %s' %
                                             (title, size))
                            else:
                                logger.debug('Found %s but %s seeder%s' %
                                             (title, seeders, plural(seeders)))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def GEN(book=None):
    """Older Library Genesis search returning direct-download results.

    book: dict with at least 'searchterm' and 'bookid'.
    Returns a list of result dicts (tor_type 'direct').
    """
    provider = "libgen"
    host = lazylibrarian.CONFIG['GEN_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    searchURL = url_fix(
        host +
        "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" +
        book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        elif '111' in result:
            # looks like libgen has ip based access limits
            logger.error(
                'Access forbidden. Please wait a while before trying %s again.'
                % provider)
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[2]
            rows = table.findAll('tr')
        except Exception:
            # no results = no table in result page
            rows = []
        # gather the cells we need per result row, skipping the header row
        c1 = []  # author cell
        c2 = []  # title / link cell
        c7 = []  # size cell
        c8 = []  # extension cell
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 8:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
                    c7.append(row.findAll('td')[7])
                    c8.append(row.findAll('td')[8])
        for col1, col2, col7, col8 in zip(c1, c2, c7, c8):
            try:
                author = unaccented(col1.text)
                title = unaccented(
                    str(col2).split('>')[2].split('<')[0].strip())
                link = str(col2).split('href="')[1].split('?')[1].split('"')[0]
                size = unaccented(col7.text).upper()
                extn = col8.text
                try:
                    # convert human readable size to bytes.
                    # NOTE(review): no 'G' branch — gigabyte sizes end up
                    # reported as 0 (newer GEN parser in this file handles 'G')
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                if link and title:
                    if author:
                        title = author.strip() + ' ' + title.strip()
                    if extn:
                        title = title + '.' + extn
                    # fetch the ads.php page to find the real download link
                    bookURL = url_fix(host + "/ads.php?" + link)
                    bookresult, success = fetchURL(bookURL)
                    if not success:
                        # may return 404 if no results, not really an error
                        if '404' in bookresult:
                            logger.debug(u"No results found from %s for %s" %
                                         (provider, book['searchterm']))
                        else:
                            logger.debug(bookURL)
                            logger.debug('Error fetching data from %s: %s' %
                                         (provider, bookresult))
                        bookresult = False
                    if bookresult:
                        url = None
                        new_soup = BeautifulSoup(bookresult)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('/get.php'):
                                url = output
                                break
                        if url:
                            url = url_fix(host + url)
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct'
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results