Example #1
def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (
        sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url,
                                                 method="GET")
    result = {
        "album_info_list": [],  # list of all album info
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_pagination_response.status))
    # page encoding
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # get album info; two page layouts exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("页面截取图集列表失败\n%s" %
                                       album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # album title
            "page_id": None,  # album page id
        }
        album_selector = album_list_selector.eq(album_index)
        # get the album id
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException(
                "图集列表截取图集地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("图集地址截取图集id失败\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # get the album title
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("总页数信息截取失败\n%s" %
                                       album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共",
                                          "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("总页数截取失败\n%s" %
                                       max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
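A minimal driver sketch (not part of the original snippet) showing one way to page through a sub-path with get_album_page until is_over is reported; the helper name is an assumption made for illustration.

def get_all_album_info(sub_path):
    # hypothetical helper, not from the original project: collect every
    # album of one sub-path by requesting pages until the last one
    album_info_list = []
    page_count = 1
    while True:
        pagination_result = get_album_page(sub_path, page_count)
        album_info_list.extend(pagination_result["album_info_list"])
        if pagination_result["is_over"]:
            break
        page_count += 1
    return album_info_list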
Example #2
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # list of all image urls
        "is_delete": False,  # whether the album has been deleted
        "model_name": "",  # model name
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_response.status))
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # get the model name
    model_name = PQ(album_response.data).find(
        "div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("模特信息截取模特名字失败\n%s" %
                                       album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # get all image urls
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("页面匹配图片地址失败\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("图片地址不符合规则\n%s" % image_url)
        result["image_url_list"].append(
            image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
Example #3
def _dump_slide(slide, idx_slide, outputdir):
    # dump a single slide's HTML to a numbered file, prefixed with the template marker
    html = PyQuery(slide).html()
    slide_name = '%03d.html' % idx_slide
    print "dump slide {} in dir {}".format(idx_slide, outputdir)
    dump = open(os.path.join(outputdir, slide_name), 'w+')
    dump.write("@template:content_bare\n")
    dump.write(html.encode('utf-8', 'replace'))
    dump.close()
Example #4
	def getData(self, selector):
		"""
		Return all the text in the area limited by the selector
		"""
		tags = self.sorceCode.find(selector)
		text = PQ(tags.html()).text()
		text = text.encode(self.encoding, 'xmlcharrefreplace')
		#print text

		return text
Example #5
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # list of all image urls
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (
            album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url,
                                                     method="GET")
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "第%s页 " % page_count +
                crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # get the album title (only fetched from the first page)
            album_title = PQ(album_pagination_response.data.decode(
                "UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("页面截取标题失败\n%s" %
                                               album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # get the image urls on this page
        image_list_selector = PQ(
            album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException(
                "第%s页 页面匹配图片地址失败\n%s" %
                (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(
                str(image_list_selector.eq(image_index).attr("src")))
        # get the total page count
        pagination_list_selector = PQ(
            album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(
                    pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            if page_count > 1:
                raise crawler.CrawlerException(
                    "第%s页 页面匹配分页信息失败\n%s" %
                    (page_count, album_pagination_response.data))
        page_count += 1
    return result
Example #6
def qidian(self):
    # self.url may be a list of book urls
    if isinstance(self.url, list):
        for url in self.url:
            try:
                bookname = PyQuery(requests.get(url).content)(
                    'h1 > em').text().strip().replace(" ", "")
                name = bookname.encode("utf-8") + ".epub"
                self.exists(name)
                bookid = re.search("\d+", url).group()
                download_url = "http://download.qidian.com/epub/%s.epub" % (
                    bookid)
                content = requests.get(download_url).content
                self.save(name, content)
            except Exception, e:
                logging.warning("download error [%s]" % (url))
Example #7
	server.sendmail(mailFrom, rcptToList, message.as_string())
	server.quit()

if '__main__' == __name__:
	configFile = 'config.cfg'
	novels = PyQuery(filename = configFile)
	message = ''
	for novel in novels('novel'):
		name = PyQuery(novel)('name').text()
		url = PyQuery(novel)('url').text()
		prefix = PyQuery(novel)('prefix').text()
		next = int(PyQuery(novel)('next').text())
		rcptToList = []
		for addr in PyQuery(novel)('emails>email'):
			rcptToList.append(PyQuery(addr).text())
		print rcptToList
		html = PyQuery(url = url)
		nextUrl = None
		for i in html('div.threadlist_title.pull_left.j_th_tit.member_thread_title_frs > a.j_th_tit'):
			if i.text.find(number2chinese(next)) != -1:
				nextUrl = prefix + PyQuery(i).attr('href')
				break
		if nextUrl:
			next += 1
			PyQuery(novel)('next').text(str(next))
			text = PyQuery(url=nextUrl)('cc:first > div:first').html()
			text = text.replace(u'<br/>', '\n').strip()
			subject = name + u' ' + u'第'+unicode(str(next))+u'章'
			send_mail('*****@*****.**', rcptToList, subject.encode('utf8'), text.encode('utf8'))
	open(configFile, 'wt').write(str(novels))
Example #8
def get_album_page(album_id):
    page_count = max_page_count = 1
    image_count = 0
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # list of all image urls
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "https://www.nvshens.com/g/%s/%s.html" % (
            album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url,
                                                     method="GET")
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "第%s页 " % page_count +
                crawler.request_failre(album_pagination_response.status))
        # check whether the album has been deleted
        if page_count == 1:
            result["is_delete"] = album_pagination_response.data.find(
                "<title>该页面未找到-宅男女神</title>") >= 0
            if result["is_delete"]:
                return result
            # get the album's total image count
            album_info = PQ(
                album_pagination_response.data).find("#dinfo span").text()
            if not album_info or album_info.encode("UTF-8").find("张照片") == -1:
                raise crawler.CrawlerException("页面截取图片总数信息失败\n%s" %
                                               album_pagination_response.data)
            image_count = album_info.encode("UTF-8").replace("张照片", "")
            if not crawler.is_integer(image_count):
                raise crawler.CrawlerException("页面截取图片总数失败\n%s" %
                                               album_pagination_response.data)
            image_count = int(image_count)
            if image_count == 0:
                result["is_delete"] = True
                return result
            # get the album title
            result["album_title"] = str(
                tool.find_sub_string(album_pagination_response.data,
                                     '<h1 id="htilte">', "</h1>")).strip()
            if not result["album_title"]:
                raise crawler.CrawlerException("页面截取标题失败\n%s" %
                                               album_pagination_response.data)
        # get the image urls on this page; two page layouts exist
        image_list_selector = PQ(
            album_pagination_response.data).find("#hgallery img")
        if image_list_selector.length == 0:
            image_list_selector = PQ(
                album_pagination_response.data).find("#pgallery img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException(
                "第%s页 页面匹配图片地址失败\n%s" %
                (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(
                str(image_list_selector.eq(image_index).attr("src")))
        # get the total page count
        pagination_html = PQ(
            album_pagination_response.data).find("#pages").html()
        if pagination_html:
            page_count_find = re.findall(
                '/g/' + str(album_id) + '/([\d]*).html', pagination_html)
            if len(page_count_find) != 0:
                max_page_count = max(map(int, page_count_find))
            else:
                log.error("图集%s 第%s页分页异常" % (album_id, page_count))
        page_count += 1
    # verify that the count shown on the page matches the number of urls collected
    if image_count != len(result["image_url_list"]):
        raise crawler.CrawlerException(
            "页面截取的图片数量 %s 和显示的总数 %s 不一致" %
            (image_count, len(result["image_url_list"])))
    return result
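A small usage sketch (not from the original code): fetch a single album with get_album_page and skip it when the site reports it as deleted; the album id used here is an invented example value.

album_result = get_album_page(10001)  # 10001 is a made-up album id
if not album_result["is_delete"]:
    # album_title and image_url_list are filled in by the function above
    print("%s: %s image(s)" % (album_result["album_title"], len(album_result["image_url_list"])))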
Example #9
def prn_tbl_sec(index, node) :
    global node_id, curr_dep, last_dep, depth, opTyp
    if index != 0 :
        print >>sys.stderr,"...Start of PART, depth="+str(depth)
        ce = PyQuery(node)
        # Print the part heading as containing node
        partLst = ce.prevAll('h3')
        partTxt = PyQuery(partLst[len(partLst)-1]).text()
        if index % 2 == 0 :
            print '<node CREATED="1347382439772" ID="PartID_'+str(index)+'" POSITION="left" MODIFIED="1347382510988" TEXT="'+partTxt.encode('utf-8')+'">'
        else :
            print '<node CREATED="1347382439772" ID="PartID_'+str(index)+'" POSITION="right" MODIFIED="1347382510988" TEXT="'+partTxt.encode('utf-8')+'">'
        rows = ce('tr')
        rows.each(prn_mm_for_sec)
        # Print the closing tags for this table
        print >>sys.stderr,"...End of PART, depth="+str(depth)
        for i in range(0, depth) :
            print '</node>'
        print '</node>'  # For the part heading containing node
        depth = 0
        last_dep = 3
Example #10
def prn_mm_for_sec(index, node) :
  global last_rowTxt,node_id, curr_dep, last_dep, depth
  ce = PyQuery(node)
  rowTxt = ce.text()
  cols = ce('td')
  curr_dep = len(cols)

  # First close the previous node if required
  #if curr_dep == 1 and cols[0].text() == '' :
  if curr_dep == 1 :
       # This is a blank line which ends a section or sub-sec
       print >>sys.stderr,"...Blank line: End of NODE, depth="+str(depth)
       print >>sys.stderr,"......Last Row Text:"+last_rowTxt
       for i in range (0,depth) :
           print '</node>'
       depth=0
  elif curr_dep == (last_dep + 1) :
    # This means a new nesting starts, just inc. depth
    depth = depth + 1
    if index == 0 :
        print >>sys.stderr,"...Start of new level-2 node: "+rowTxt
  elif (curr_dep + 1) == last_dep :
    # This means a nesting has ended, dec. depth & print 2 end tags
    depth = depth - 1
    print '</node>'
    print '</node>'
  elif curr_dep  == last_dep :
    # This means are at the same level: just end the previous node tag
    print '</node>'
  elif curr_dep >= 3 and  last_dep == 1 :
    # This means start of a new level-1 node
    # DO NOTHING
    print >>sys.stderr,"...Start of new level-2 node: "+rowTxt
    depth = 1
  else :
    print >>sys.stderr,"...Curr dep. is neither one more nor less than prev. depth"
    print >>sys.stderr,"......Curr. dep:"+str(curr_dep)+" last dep:"+str(last_dep)
    print >>sys.stderr,"......Last Row Text:"+last_rowTxt
    print >>sys.stderr,"......Curr. Row Text:"+rowTxt
  # Next print the text for current node if not empty line
  if curr_dep >= 2 :
    nodeTxt = PyQuery(cols[curr_dep - 2]).text()+" "+PyQuery(cols[curr_dep - 1]).text()
    print '<node CREATED="1347382439772" ID="ID_'+str(node_id)+'" MODIFIED="1347382510988" TEXT="'+nodeTxt.encode('utf-8')+'">'
  last_dep = curr_dep
  last_rowTxt = rowTxt
  node_id = node_id + 1