Beispiel #1
0
def getImages(chapter):
    """Collect one image entry per page of a chapter.

    The chapter page at ``chapter`` is fetched and every ``<option>`` inside
    ``#pageMenu`` is treated as one page: its ``value`` attribute (appended
    to ``MAIN``) is fetched in turn and the ``src`` of the first ``#img``
    element on that page is recorded.

    Returns a list of ``utils.Content`` objects built from the option's text
    and the image ``src``, in menu order.

    Raises ``IndexError`` if a page contains no ``#img`` element.
    """
    chapter_doc = lh.fromstring(utils.getUrlContent(chapter))
    collected = []
    for page_option in chapter_doc.cssselect('#pageMenu option'):
        page_url = MAIN + page_option.attrib['value']
        page_doc = lh.fromstring(utils.getUrlContent(page_url))
        # Only the first match is used; an absent image is an error.
        image = page_doc.cssselect('#img')[0]
        entry = utils.Content(page_option.text_content(), image.attrib['src'])
        collected.append(entry)
    return collected
Beispiel #2
0
def getChapters(link):
    """List the chapters linked from the page at ``link``.

    Every ``<a>`` matched by ``#listing tr a`` becomes one ``utils.Content``
    built from the anchor text, ``MAIN`` plus the anchor's ``href``, and a
    1-based position in document order.
    """
    listing_doc = lh.fromstring(utils.getUrlContent(link))
    anchors = listing_doc.cssselect('#listing tr a')
    return [
        utils.Content(anchor.text_content(), MAIN + anchor.attrib['href'], position)
        for position, anchor in enumerate(anchors, 1)
    ]
Beispiel #3
0
def search(query):
    """Run a search for ``query`` and return the matching entries.

    Spaces in ``query`` are replaced with ``+`` before the search URL is
    built with ``utils.getUrl(SEARCH, ...)``. The response is read as
    newline-separated records whose fields are delimited by ``|``; the first
    field (stripped) and ``MAIN`` plus the fifth field (stripped) are passed
    to ``utils.Content``.

    Lines with fewer than five fields (blank, whitespace-only or truncated
    records) are skipped instead of raising ``IndexError``.

    Returns a list of ``utils.Content`` objects in response order.
    """
    response = utils.getUrlContent(utils.getUrl(SEARCH, query.replace(" ", "+")))
    results = []
    for line in response.split('\n'):
        fields = line.split('|')
        # The link lives in column 4; a record without it cannot be used.
        # This also covers empty lines, which split into a single field.
        if len(fields) < 5:
            continue
        results.append(utils.Content(fields[0].strip(), MAIN + fields[4].strip()))
    return results
Beispiel #4
0
 def parse_page_data(self, raw_url, page_info, runtime_status, post_datas=None):
     """Fetch ``raw_url`` and extract result rows as described by ``page_info``.

     Parameters:
         raw_url: UTF-8 encoded URL string; it is re-encoded to the page
             encoding before the request is made.
         page_info: object providing ``encoding``, ``block_match`` and
             ``regular_matchs``. Each entry of ``regular_matchs`` provides
             ``regulars`` (regex patterns), ``omit_tags``, ``result`` (key
             prefix), ``is_unique`` and ``is_scroll`` (both compared
             against the string ``"1"``).
         runtime_status: container probed with ``in`` for keys of the form
             ``result + str(n)`` when a matcher finds nothing.
         post_datas: passed through to ``utils.getUrlContent``; defaults to
             a fresh empty dict on every call.

     Returns:
         The row list obtained from ``self.parse_block_match`` after every
         matcher in ``page_info.regular_matchs`` has been merged into it
         (rows are used as dicts here).
     """
     # A fresh dict per call: a shared ``{}`` default would carry state from
     # one call to the next if the fetch helper ever mutated it.
     if post_datas is None:
         post_datas = {}

     # Fetch the response page for raw_url and deal with its encoding.
     page_encoding = "UTF-8"
     if page_info.encoding.strip():
         page_encoding = page_info.encoding

     # NOTE(review): "unicode" is not a standard codec name, so for that
     # setting this encode presumably raises LookupError before the branch
     # below is reached, unless a codec is registered elsewhere - confirm.
     raw_url = raw_url.decode("UTF-8", "ignore").encode(page_encoding, "ignore")
     page_src = utils.getUrlContent(raw_url, post_datas)

     if page_encoding == "unicode":
         # SECURITY: eval() executes the downloaded page text as a Python
         # expression, so a hostile page can run code and a page that merely
         # contains a quote or newline raises SyntaxError. Left unchanged on
         # purpose; NOTE(review): consider
         # page_src.decode("unicode_escape") instead.
         page_src = eval("u'" + page_src + "'").encode('utf-8', "ignore")
     else:
         page_src = page_src.decode(page_encoding, "ignore").encode('utf-8', "ignore")

     # Start parsing the fetched page_src.

     # Use the block locator to cut the response page into the result blocks
     # that need detailed parsing; there may be several blocks.
     block_data_map_list = self.parse_block_match(page_src, page_info.block_match)

     # Match the whole page against each regular expression; the matches are
     # merged into every record of block_data_map_list.
     for regular_match in page_info.regular_matchs:
         datalist = []  # data captured by the regular expressions
         for regular in regular_match.regulars:
             # page_src is reused by every pattern, so work on a stripped copy.
             tmp_src = self.remove_tags(page_src, regular_match.omit_tags)
             datalist.extend(re.compile(regular).findall(tmp_src))

         # Nothing matched: add one "n/a" placeholder for every consecutive
         # key result0, result1, ... already present in runtime_status.
         # NOTE(review): the keys written below start at suffix 1 while this
         # probe starts at 0 - confirm that is intended.
         if not datalist:
             tmp_n = 0
             while (regular_match.result + str(tmp_n)) in runtime_status:
                 datalist.append("n/a")
                 tmp_n += 1

         tmp_addon_list = []  # rows newly produced by this matcher
         scroll_parts = []    # every captured value, in match order
         for data_i, data in enumerate(datalist):
             # is_unique == "1": only the first match is used;
             # otherwise every match is used.
             if regular_match.is_unique == "1" and data_i > 0:
                 break
             grub_status = {}

             # With zero or one capturing group findall yields strings:
             #   ['qqq', 'hyx']
             # With several capturing groups it yields tuples:
             #   [('qqq', 'hyx'), ('12', 'hellooo')]
             if isinstance(data, str):
                 grub_status[regular_match.result + "1"] = data
                 scroll_parts.append(data)
             else:
                 for i, group in enumerate(data):
                     grub_status[regular_match.result + str(i + 1)] = group
                     scroll_parts.append(group)

             if regular_match.is_scroll != "1":
                 # Each existing row is copied once per match and extended
                 # with the captured values; on a key clash the value
                 # already in the row wins because it is applied last.
                 for items in block_data_map_list:
                     tmp_map = {}
                     tmp_map.update(grub_status)
                     tmp_map.update(items)
                     tmp_addon_list.append(tmp_map)

         # Folded form of all captures: each value followed by "||".
         scroll_str = "".join(part + "||" for part in scroll_parts)

         if regular_match.is_scroll != "1":
             # Replace the rows with the extended ones; when no row was
             # produced the existing rows are kept as they are.
             if tmp_addon_list:
                 block_data_map_list = [
                     items for items in tmp_addon_list if len(items) > 0
                 ]
         else:
             # Folded mode: keep the non-empty rows (or a single empty row
             # when there are none) and store the folded string in each,
             # mutating the row dicts in place.
             kept_rows = [items for items in block_data_map_list if len(items) > 0]
             if not kept_rows:
                 kept_rows = [{}]
             for items in kept_rows:
                 items[regular_match.result] = scroll_str
             block_data_map_list = kept_rows

     return block_data_map_list