def dir_parse(self, page, spider_list, result_list):
    print page
    doc = pyq(page)
    tmp = doc('table[class=tableList]')
    trl = tmp('tr')
    for v in trl:
        td = pyq(v)('td[class=title]')
        a = td('a')
        name = a.text().encode("UTF-8").decode("UTF-8")
        ename = ""
        print name
        if len(name) > 1:
            for uchar in name:
                #print uchar
                if is_alphabet(uchar):
                    ename += uchar
                #elif uchar == '.' or uchar == ' ' or uchar == '&':
                #    ename += uchar
                elif (uchar == '(' or is_number(uchar)) and len(ename) > 2:
                    break
            print "xxxx", ename
            link = "http://banyungong.net/" + a.attr('href')
            result_list.append((ename.lower() + "," + link).encode("UTF-8"))
    return ""
def parse_json(jso):
    if jso is not None:  # make sure the request actually returned data
        items = jso.get('data').get('cards')
        for item in items:
            a = item.get('mblog')
            if a is None:  # skip empty posts: .get() cannot be used on them
                continue
            elif a['isLongText']:  # long posts need a second request for the full text
                p = all(a['id'])
                if p.get('data') is None:
                    continue
                else:
                    b = {
                        'text': pyq(p.get('data')['longTextContent']).text(),  # pyq() strips the HTML markup
                        'date': a['created_at']
                    }
                    with open('text.txt', 'a', encoding='utf-8') as f:
                        f.write(str(b))
                        f.write('\n')
                    #print(b)
            elif not a['isLongText']:
                b = {'text': pyq(a['text']).text(), 'date': a['created_at']}
                with open('text.txt', 'a', encoding='utf-8') as f:
                    f.write(str(b))
                    f.write('\n')
                #print(b)
    else:
        print("用户不存在或已注销...")  # the user does not exist or has been deactivated
def get_posts(self, fid, multi=False, index=2, size=100):
    if multi:
        for index in range(index, size, 1):
            time.sleep(random.randint(20, 60))
            url = self.forumUrl + "thread-htm-fid-{fid}-page-{page}.html".format(fid=fid, page=index)
            req = urllib2.Request(url)
            content = urllib2.urlopen(req).read().decode('gbk')
            doc = pyq(content)
            data = doc("#threadlist").children(".nthtr3").items()
            for item in data:
                pid = item.children(".subject").attr("id").split("td_")[1]
                print "当前的帖子id: %s" % pid
                self.pids.append(pid)
                time.sleep(random.randint(30, 200))
                self.get_post(pid)
                time.sleep(random.randint(20, 200))
                self.reply(pid)
    else:
        time.sleep(random.randint(20, 60))
        url = self.forumUrl + "thread-htm-fid-{fid}.html".format(fid=fid)
        req = urllib2.Request(url)
        content = urllib2.urlopen(req).read().decode('gbk')
        doc = pyq(content)
        for item in doc("#threadlist").children(".nthtr3").items():
            pid = item.children(".subject").attr("id").split("td_")[1]
            print "当前的帖子id: %s" % pid
            self.pids.append(pid)
            time.sleep(random.randint(30, 200))
            self.get_post(pid)  # fetch the single post; calling get_posts() here would recurse with a pid as fid
            time.sleep(random.randint(20, 300))
            self.reply(pid)
def run(self):
    headers = {'connection': 'close'}
    response = requests.get(self.url, headers=headers)
    response.encoding = 'utf-8'
    column_jq = pyq(response.text)
    column = column_jq('title').text()
    parsed_body = html.fromstring(response.text)
    song_urls = parsed_body.xpath('//a[contains(@href, "/play/")]/@href')
    new_lyrics = []
    for song_url in song_urls:
        full_url = urlparse.urljoin("http://www.9ku.com", song_url)  # prepend the base url
        r = requests.get(full_url, headers=headers)
        r.encoding = 'utf-8'  # refer to test/get_chinese.py
        jq = pyq(r.text)
        # get title and author from the song page
        brief = jq('h2#play_title').text()
        title = brief.split(' ')[1]
        author = brief.split(' ')[3]
        # two types of song pages
        if jq('div.lrcBox').text():
            content = jq('div.lrcBox').text()
        else:
            out_url = jq('h2#play_title').find('a').eq(2).attr('href')
            r_out = requests.get(out_url, headers=headers)
            r_out.encoding = 'utf-8'  # may not be needed
            jq_out = pyq(r_out.text)
            content = jq_out('div.ciInfo').eq(0).text()
        new_lyric = Lyric2(column=column, title=title, author=author, content=content)
        new_lyric.save()
        print 'get data from %s at %s' % (full_url, time.ctime())
def getPages(self):
    dirs = []
    #doc = pyq(self.url)
    #while doc("div:contains('Browse Problems')+div+table img").attr('alt') == "FOLDER" and (None in dirs[p].values()):
    #    dirs[p].update(dict.fromkeys([self.root+'/'+a.attr('href') for a in doc("div:contains('Browse Problems')+div+table a")]))
    #    for d, c in dirs[p].items():
    dirs.append(self.url)
    while dirs:
        curdir = dirs.pop()
        try:
            doc = pyq(curdir)
        except (httplib.IncompleteRead, urllib2.URLError):
            print "Bug!!!!!!!!!!!!!1"
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            doc = pyq(curdir)
            #httplib.HTTPConnection._http_vsn = 11
            #httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        if doc("div:contains('Browse Problems')+div+table img").attr('alt') == "FOLDER":
            print "[folder]", curdir
            links = doc("div:contains('Browse Problems')+div+table a")
            for a in links:
                dirs.append(self.root + '/' + pyq(a).attr('href'))
        else:
            print "[file]", curdir
            self.pages.append(curdir)
def _fetch_query(self, url, page=0):
    print "-" * 10, " Fetch Page %s " % (page + 1), "-" * 10
    print url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.HTTPError as e:
        if e.code == 429:
            print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
            eventlet.sleep(self._too_many_request_sleep)
            return self._fetch_query(url, page)
        raise e
    jq = pyq(html)
    urls = []
    user_list = jq(".user-list-item")
    for i in user_list:
        name = pyq(i).find(".user-list-info a")
        href = self._domain + name.attr("href")
        urls.append(href)
    users = []
    for user in pool.imap(self._fetch_user, urls):
        users.append(user)
    if page == 0:
        max_page_index = jq(".next_page").prev("a").text()
        users.extend(self._fetch_query_by_page(url, int(max_page_index)))
    return users
def main():
    doc = pyq(filename='html.txt')
    doc1 = doc('div')
    doc2 = doc1('a')
    # print(doc2)
    TieBaDate = {}
    try:
        f = open('source.txt', 'w')
    except IOError:
        print("Error: open file failed.")
    iSum = 0
    for i in doc2:
        tmphref = pyq(i).attr('href')
        tmptitle = pyq(i).attr('title')
        strhref = repr(tmphref)
        strtitle = repr(tmptitle)
        aryhref = re.findall('/p/(\d+)', strhref)
        if re.findall('/p/(\d+)', strhref) != [] and re.findall('(.*?)魔枪(.*?)', strtitle) != []:
            # print(strtitle)
            # print(strhref)
            strsource = 'http://tieba.baidu.com/p/%s' % aryhref[0]
            f.write(strsource)
            f.write("\n")
            iSum += 1
            AnalyHtml(url=strsource, filePath='')
            break
    print('sum :', iSum)
    f.close()
def _fetch_user(self, url):
    try:
        html = urllib2.urlopen(url + "?tab=repositories").read()
    except urllib2.HTTPError as e:
        if e.code == 429:
            print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
            eventlet.sleep(self._too_many_request_sleep)
            return self._fetch_user(url)
        raise e
    jq = pyq(html)
    data = {}
    data['url'] = url
    data['name'] = jq(".vcard-fullname").text()
    data['avatar'] = jq(".avatar").attr("src")
    data['location'] = jq("[aria-label='Home location']").attr("title")
    data['email'] = jq("[aria-label=Email] a").text()
    data['website'] = jq("[aria-label='Blog or website'] a").text()
    data['join'] = jq(".join-date").attr("datetime")
    data['followers'] = jq(".vcard-stat-count:eq(0)").text()
    data['starred'] = jq(".vcard-stat-count:eq(1)").text()
    data['following'] = jq(".vcard-stat-count:eq(2)").text()
    data['repositories'] = {}
    sources = jq(".repo-list-item.source")
    data['repositories']['source_count'] = len(sources)
    data['repositories']["source_lang"] = {}
    for i in sources:
        lang = pyq(i).find("[itemprop='programmingLanguage']").text()
        data['repositories']["source_lang"].setdefault(lang, 0)
        data['repositories']["source_lang"][lang] += 1
    return data
def get_jiandan_mm_pic(page_num):
    url = 'http://jandan.net/ooxx/page-' + str(page_num)
    html = pyq(url)
    print('reading ... http://jandan.net/ooxx/page-{0}\n'.format(page_num))
    sys.stdout.flush()
    #print(html)
    hash_pic_message = {}
    # collect the image urls
    for element in html('li div div.row div.text'):
        img = pyq(element).find('img')
        #img = pyq(element)('img')
        if img != None:
            id = pyq(element)('span a').text()
            #id = id.replace("vote-", "")
            hash_pic_message[id] = {}
            hash_pic_message[id]['ID'] = id
            hash_pic_message[id]['URL'] = []
            hash_pic_message[id]['FileName'] = []
            if img.attr('org_src') == None:
                for t in img:
                    url = img(t).attr('src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
            else:
                for t in img:
                    url = img(t).attr('org_src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
    # read the image id and the OO/XX vote counts
    for element in html('li div div.row div.jandan-vote'):
        id = pyq(element)('a').attr('data-id')
        #id = id.replace("vote-", "")
        vote = pyq(element).text()
        reg_vote = 'OO \[ (\d.*) \] XX \[ (\d.*) \]'
        pattern = re.compile(reg_vote)
        result = pattern.findall(vote)
        if result != None:
            support = result[0][0]
            unsupport = result[0][1]
            hash_pic_message[id]["Support"] = support
            hash_pic_message[id]["UnSupport"] = unsupport
            if unsupport != "0":
                scale = float(support) / float(unsupport)
            else:
                scale = 0.0
            rank = get_scale(scale)
            hash_pic_message[id]["Scale"] = scale
            hash_pic_message[id]["Rank"] = rank
    for value in hash_pic_message.values():
        #print(value)
        pass
    return hash_pic_message.values()
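# A brief usage sketch for get_jiandan_mm_pic() above: fetch a range of pages and
# merge the per-image records into one list. The page numbers are arbitrary
# examples and collect_pages is a hypothetical helper, not part of the original code.
def collect_pages(first_page, last_page):
    pics = []
    for page_num in range(first_page, last_page + 1):
        pics.extend(get_jiandan_mm_pic(page_num))
    return pics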
def getHtmlByPyquery(tUrl):
    posts = []
    from pyquery import PyQuery as pyq
    r = requests.get(tUrl)
    doc = pyq(r.text)
    lis = doc(".car-monthlisting li a")
    lis = lis[0:100]
    lis.reverse()
    i = 1
    for li in lis:
        link = pyq(li).attr("href")
        title = pyq(li).text()
        print "抓取文章_%s(%s,link:%s)" % (i, title, link)
        ir = requests.get(link)
        idoc = pyq(ir.text)
        content = idoc("#content .entrybody").remove(".wumii-hook").remove("script").remove("ins").remove(".ds-thread").remove("#ds-ssr").remove("div").remove("#comments").html()
        content = content.replace("\"", "\"\"")
        #print content
        post = Post()
        post.category = urllib.quote("notes") + ":段子"
        post.post_author = "geekzone"
        post.post_title = title
        post.post_content = "\"" + content + "\""
        posts.append(post)
        i = i + 1
    return posts
def exportText(section, idx, link):
    # url = "http://book.kanunu.org/book3/6630/115916.html"
    # req = urllib2.Request(url)
    # response = urllib2.urlopen(req).read()
    fileName = section + "/" + idx + ".html"
    textFile = open(fileName)
    mainHtml = textFile.read()
    textFile.close()
    html = unicode(mainHtml, "GBK")
    doc = pyq(html)
    tables = doc.find("table")
    a = []
    for table in tables:
        a.append(len(pyq(table).text()))
    mx = max(a)
    textIdx = a.index(mx)
    titleIdx = textIdx - 1
    mainText = pyq(tables[textIdx]).find("p").html()
    # afterTitle = mainText.index(r"<br/>")
    # mainTitle = mainText[0:afterTitle].replace(u" ", "").replace(u"】", "").replace(u"【", "").strip().encode("UTF-8")
    # mainTitle = pyq(tables[titleIdx]).text().replace(u"上部 ", "").replace(u"中部 ", "").replace(u"下部 ", "").encode("UTF-8")
    mainTitle = pyq(tables[titleIdx]).text().encode("UTF-8")
    outFile = open("Text/" + section + "/" + idx + ".xhtml", "w")
    outFile.write("<h1>" + mainTitle + "</h1>")
    # outFile.write("<p>")
    outFile.write(mainText.encode("UTF-8"))
    # outFile.write("</p>")
    outFile.write("<p><br/>" + link + "<br/></p>")
    outFile.close()
    titleList.append(mainTitle)
    return mainTitle
def getSrc(url):
    text = getInfo(url)
    doc = pyq(text)
    cts = doc('.thumb_mov_model').find('a')
    for i in cts:
        link = pyq(i).attr('href')
        src = pyq(getInfo(link))('#example_video_1').find('source').attr('src')
        yield src
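# A minimal usage sketch for getSrc() above: it is a generator, so callers iterate
# over it. collect_srcs is a hypothetical wrapper, the URL argument is whatever list
# page the caller wants, and getInfo()/pyq are assumed to live in the same module
# as the function itself.
def collect_srcs(list_url):
    return [src for src in getSrc(list_url) if src]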
def main():
    url = 'http://taiwan.huanqiu.com/news/'
    #url = 'http://world.huanqiu.com/observation/'
    #url = 'http://china.huanqiu.com/politics/'
    doc = pyq(url=url)
    alist = doc('.pad20 li a')
    for a in alist:
        link = pyq(a).attr('href')
        get_info(link)
def get_pdf_url(self, url_list):
    pdf_list = []
    print(url_list)
    for i in url_list:
        # open each report page and locate the link to its PDF
        r = requests.get(i)
        doc = pyq(r.content.decode('gbk'))
        pdf = doc('a').filter(lambda j, this: 'PDF' in pyq(this).text()).eq(0)
        pdf_list.append(pdf.attr['href'])
    return pdf_list
def get_proxy_list(self, page_range=15):
    __all_proxy_list = []
    for __page in range(page_range):
        __url = 'http://proxylist.hidemyass.com/%s#listable' % __page
        __request = urllib2.Request(__url, headers=self.__headers)
        __response = urllib2.urlopen(__request)
        __the_page = __response.read()
        doc = pyq(__the_page)
        for __list_idx in doc('#listable tbody>tr')[:]:
            __tmp = doc(__list_idx).outerHtml()
            p = pyq(__tmp)
            for __j in p('style').text().split('\n'):
                if __j.find('display:none') > 0:
                    p.remove(__j.split('{')[0])
            p.remove('style')
            for __j in p('span,div'):
                if p(__j).attr('style') == 'display:none':
                    p(__j).remove()
            __proxy = {
                'last_update': p('td').eq(0).text(),
                'ip_address': p('td').eq(1).text().replace(' ', ''),
                'port': p('td').eq(2).text(),
                'country': p('td').eq(3).text(),
                'countryIsoCode': p('td').eq(3).attr('rel'),
                'type': p('td').eq(6).text(),
                'anon': p('td').eq(7).text(),
                'speed': ''.join(re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0])),
                'connection_time': ''.join(re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]))
            }
            print __proxy
            __all_proxy_list.append(__proxy)
    pickle.dump(__all_proxy_list, open('free_proxy_list', 'wb'))
    __all_proxy_list = pickle.load(open('free_proxy_list', 'r'))
    return __all_proxy_list
    # the per-country counts below are unreachable after the return above (kept from the original)
    all_count_cnt = {}
    for __i in __all_proxy_list:
        if all_count_cnt.has_key(__i['country']):
            all_count_cnt[__i['country']] = all_count_cnt[__i['country']] + 1
        else:
            all_count_cnt[__i['country']] = 1
    print all_count_cnt
    all_count_cnt = {}
    for __i in __all_proxy_list:
        if all_count_cnt.has_key(__i['countryIsoCode']):
            all_count_cnt[__i['countryIsoCode']] = all_count_cnt[__i['countryIsoCode']] + 1
        else:
            all_count_cnt[__i['countryIsoCode']] = 1
    print all_count_cnt
def _parse_data(self, pyq_node, k, data, debug):
    keymap = []
    path = data['path']
    pathlist = path.split(',')
    node = pyq_node
    for p in pathlist:
        if 'attr@' in p:
            attr = p[5:]
            value = node.attr(attr)
            return value
        elif 'text' == p:
            if node.text() != None:
                value = node.text().encode("utf-8")
            else:
                value = None
            return value
        elif '#' in p:
            pl = p.split('#')
            #print pl[0], pl[1]
            node = node(pl[0].encode("utf-8")).eq(int(pl[1]))
            if node != None:
                node = pyq(node)
            else:
                return None
        else:
            node = node(p.encode("utf-8"))
            if node != None:
                #node = pyq(node)(p)
                node = pyq(node)
            else:
                return None
        if debug:
            print "DEBUG,p", p
            print node
    # for key in data:
    #     if key != 'path':
    #         keymap[k] = []
    #         break
    if len(node) > 0:
        if debug:
            print "DEBUG", k
            print node
        for d in node:
            submap = {}
            for key in data:
                if key != 'path':
                    res = self._parse_data(pyq(d), key, data[key], debug)
                    submap[key] = res
            keymap.append(submap)
    return keymap
def getRound(self):
    homepyq = pyq(self.rsp)
    uri = homepyq('ul.lpage_race_nav.clearfix').children().eq(1).find('a').attr('href')
    url = self.pagehost + uri
    rsp = urllib.urlopen(url).read()
    jfpyq = pyq(rsp)
    countRnd = jfpyq('ul.lsaiguo_round_list.clearfix').children().length
    self.rnd = jfpyq('ul.lsaiguo_round_list.clearfix').children('li.on').find('a').text()
    return countRnd
def parse(self, url):
    # parse the first page of the goods list
    res = requests.get(url)
    assert res.status_code == 200
    jq = pyq(res.content)
    goods_list = jq('.list-container>ul>li>a')
    for r in goods_list:
        goods_url = r.get('href')
        if not goods_url:
            continue
        goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
        goods_name = r.get('title')
        # print goods_url, goods_name
        goods_item = {
            'url': goods_url,
            'name': goods_name,
        }
        self.goods_list.append(goods_item)
    # parse the AJAX-loaded goods list pages (pages 2..n)
    next_page = jq('#infiload_nav>a')
    if next_page:
        next_page = next_page[0]
        max_page = int(next_page.get('data-maxpage'))
        next_url = next_page.get('href')
        np = re.findall('page=(\d+)', next_url)
        if not np:
            return
        np = int(np[0])
        while np <= max_page:
            next_url = re.sub('page=(\d+)', 'page=%s' % (np), next_url)
            np += 1
            res = requests.get('%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, next_url))
            assert res.status_code == 200
            jq_page = pyq(res.content)
            goods_list = jq_page('li>a')
            if not goods_list:
                # nothing left to parse
                break
            for r in goods_list:
                goods_url = r.get('href')
                if not goods_url:
                    continue
                goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
                goods_name = r.get('title')
                goods_item = {
                    'url': goods_url,
                    'name': goods_name,
                }
                self.goods_list.append(goods_item)
def getFunctions(url):
    apihost = 'file:///opt/Aldebaran Robotics/Choregraphe Suite 2.1/share/doc/naoqi/'
    if url == '#':
        return
    url = apihost + url
    doc = pyq(urllib2.urlopen(url).read())
    for nodefunction in doc('.function'):
        func = pyq(pyq(nodefunction).children()[0])
        funcName = func('.descname').text()
        module = func('.descclassname').text().split('::')[0].split('Proxy')[0]
        params = []
        for param in func('em'):
            params.append(pyq(param).text())
        if not codes.has_key(module):
            codes[module] = ''
            codes[module] += 'from naoqi import ALProxy\n'
            codes[module] += 'from network.const import PORT\n'
            codes[module] += 'from lib.cmd_parser import Cmd\n'
            codes[module] += 'from lib.elementParser import parse\n\n'
            codes[module] += 'proxy = ALProxy(\'' + module + '\', \'127.0.0.1\', PORT)\n\n'
            codes[module] += 'def proceed(cmd):\n'
            codes[module] += '\tcmd = Cmd(cmd)\n'
            codes[module] += '\tcmd.removeHead()\n'
            codes[module] += '\tprint \'going to function:\', cmd.getCommand()\n'
            codes[module] += '\tfunc = globals().get(cmd.getCommand())\n'
            codes[module] += '\tif func:\n'
            codes[module] += '\t\treturn func(cmd.getValues(\'p\'))\n'
            codes[module] += '\telse:\n'
            codes[module] += '\t\tprint \'Error: Cannot find command:\' + cmd.getCommand()\n'
            codes[module] += '\t\treturn \'Error: Cannot find command:\' + cmd.getCommand()\n\n'
        codes[module] += 'def ' + funcName + '(params):\n'
        if params:
            codes[module] += '\tif len(params) < ' + str(len(params)) + ':\n'
            codes[module] += '\t\tprint \'Error: function \\\'' + funcName + '\\\' takes 2 params\'\n'
            codes[module] += '\t\treturn \'Error: function \\\'' + funcName + '\\\' takes 2 params\'\n'
            for i in range(len(params)):
                codes[module] += '\t' + params[i] + ' = parse(params[' + str(i) + '])\n'
        codes[module] += '\treturn proxy.' + funcName + '('
        if params:
            codes[module] += params[0]
            for i in range(1, len(params)):
                codes[module] += ',' + params[i]
        codes[module] += ')\n\n'
def get_betting_odds_info_list(self):
    h = requests.get(self.url, timeout=self.timeout)  # , proxies=self.proxies
    text = h.content
    pq = pyq(text)
    betting_odds_info_list = []
    startdate_html = pq('.event-holder.holder-scheduled>.eventLine.status-scheduled')
    url_html = pyq(startdate_html)('meta[itemprop=\'url\']')
    matchup_html = pyq(startdate_html)('meta[itemprop=\'name\']')
    for i in range(len(startdate_html)):
        betting_odds_info_list.append({
            'start_time': startdate_html.eq(i).attr('rel'),
            'url': url_html.eq(i).attr('content'),
            'away_team': matchup_html.eq(i).attr('content').split(' vs ')[0],
            'home_team': matchup_html.eq(i).attr('content').split(' vs ')[1]
        })
    return betting_odds_info_list
def get_page_book_info(url, book):
    html = pyq(url)
    next_link = None
    print('reading ... {0}\n'.format(url))
    sys.stdout.flush()
    # collect the book entries on this page
    for element in html('ul.list li.o'):
        o_img = pyq(element)('div.o-img')
        o_info = pyq(element)('div.o-info')
        link = o_img('a').attr('href')
        img_src = o_img('img').attr('src')
        o_name = pyq(element)('h3.o-name a').text()
        o_author = pyq(element)('p.o-author a').text()
        o_ext = pyq(element)('p.o-ext').text()
        o_cate = pyq(element)('p.o-cate a').text()
        o_data = pyq(element)('p.o-data i.icon').text()
        t_temp = o_data.split(" ")
        if t_temp != None:
            o_click = t_temp[0]
            o_download = t_temp[1]
        print(o_name, o_author, link, img_src, o_ext, o_cate, o_click, o_download)
        sys.stdout.flush()
        index = len(book) + 1
        book[index] = {}
        book[index]["Index"] = index
        book[index]["Name"] = o_name
        book[index]["Author"] = o_author
        book[index]["Tag"] = o_cate
        book[index]["EXT"] = o_ext
        book[index]["Link"] = link
        book[index]["Picture"] = img_src
        book[index]["Click_Number"] = o_click
        book[index]["Download_Number"] = o_download
    # find the link to the next page (the anchor whose text is '下一页')
    for link in html('ul.paging li a'):
        if pyq(link).text() == '下一页':
            next_link = pyq(link).attr('href')
    if next_link != None:
        return book, next_link
    else:
        return book, None
def on_parse(self, resp):
    if resp.code == 200:
        body = resp.body.decode('gbk')
        doc = pyq(body)
        items = doc('#r table')
        lst = []
        for item in items:
            _item = pyq(item)
            _news = {}
            _news['title'] = _item('.text b').text()
            _news['source'] = _item('font nobr').html()
            _news['body'] = _item('font[size="-1"]').text()
            _news['url'] = pyq(_item('.text a')[0]).attr('href')
            lst.append(_news)
        self._callback(lst)
def getProblemMax(self):
    self.getVolumeCount()
    d = pyq(url=self.baseUrl + self.voluemPath + str(self.voluemCount))
    self.problemMax = int(d('#content_body > form:nth-child(1) > table > tr:last-child > td.problemId > a > font').text())
    self.problemCount = self.problemMax - 1001 + 1
    return self.problemMax
def fromLeagueidGetTeamid2(self):
    url_ = self.urldomain + "/league/%s/jifen" % self.leagueid
    send_headers = {
        'Host': 'www.dszuqiu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive'
    }
    req = urllib2.Request(url_, headers=send_headers)
    htm_rsp = urllib2.urlopen(req).read()
    mpyq = pyq(htm_rsp)
    trs = mpyq('table.live-list-table tbody tr')
    teamlist = []
    for i in range(trs.length):
        tds = trs.eq(i).find('td')
        if tds.length > 0:
            jq_as = tds.eq(2).find('a')
            print jq_as.text()
            ttid = jq_as.eq(0).attr('href').split('/')[-1]
            if ttid in teamlist:
                continue
            teamlist.append(ttid)
            print ttid + " " + jq_as.eq(0).text()
            self.team_dict.setdefault(ttid, jq_as.eq(0).text())
    return self.team_dict
def parsePost(self, response):
    def filterRule(url):
        if '/wp-content/uploads/' in url:
            return url
    d = pyq(response.body)
    post = {
        "url": response.url,
        "title": d('h1.entry-title').text(),
        "category": response.css('span.cat-links > a::text').extract()[0],
        "datetime": response.css('time.entry-date::text').extract()[0],
        "author": response.css('span.vcard > a::text').extract()[0],
        "content": d('div.entry-content').text(),
        "img": filter(filterRule, response.css('img::attr(src)').extract()),
    }
    self.postcollection.update({"url": post['url']}, post, True)
    '''
    the scheduler of yield here is different from that in tornado or twisted:
    it will call `next()` immediately, rather than waiting until the IO has
    completed, so just use yield -- it is still in parallel
    '''
    for url in post['img']:
        yield Request(url, callback=self.saveImage)
def parseThread(self, response):
    url = response.url.replace('http://bbs', 'http://www')
    reply = []
    for floor in response.css('div.tpc_content').extract():
        reply.append(pyq(floor).text())
    self.collection.update({"url": response.url}, {'$set': {"reply": reply}}, True)
def tongji(headers, url, begin, end):
    '''
    Count the posts per user id between the given floors of a thread
    and sort the result from most to fewest.
    '''
    links = [url + "%d" % i for i in range(int(begin), int(end) + 1)]
    mydict = {}
    for url in links:
        response, content = http.request(url, 'GET', headers=headers)
        doc = pyq(content)
        for i in range(13, 77, 7):
            if doc("tr").eq(8).text().decode('utf-8') == "提示:本主题启用了“允许发言针对特定用户”功能,您可以单击“回复主题”创建针对特定用户的回复,或单击每一楼层的“答复”按钮快速创建该楼层发表者才可见的回复。":
                i += 1
            try:
                name = doc("tr").eq(i)
                s = name.text().decode('utf-8')
                # print s,
                if not s in mydict:
                    mydict[s] = 1
                else:
                    mydict[s] += 1
            except BaseException:
                pass
    delstr = "管理选项 : 修复 | 解锁 | 提升 | 下沉 | 删除 | 移动 | 高亮 | 固顶 | 总固顶 | 区固顶 | 解除保存 |"
    delstr = delstr.decode('utf-8')
    if delstr in mydict:
        del mydict[delstr]  # delstr is already unicode; decoding it a second time would fail
    mydict = sorted(mydict.iteritems(), key=itemgetter(1), reverse=True)
    return mydict
def get(self, template_variables={}):
    url = self.get_argument("url", "")
    template_variables["tmall_link"] = url
    template_variables["static_path"] = self.static_path
    if is_weixin_browser(self) or is_mobile_browser(self):
        tmall_pattern = re.compile(r'http://detail.tmall.com/item.htm?\S*id=(\d+)')
        tmall_match = tmall_pattern.search(url)
        if tmall_match:
            sku = tmall_match.group(1)
            doc = pyq(
                "http://djaa.cn/ajax/cm_details/to_cm_details_tmall.php",
                headers={
                    'User-Agent': 'Mozilla/5.0 (MicroMessenger;iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'
                },
                method='post',
                data={
                    'id': sku,
                    'shopUrl': url,
                    'shop_type': 'tmall',
                    'small_shop_type': 'cm_details'
                })
            #print doc
            title = doc('.dtif-h').text()
            content = doc('.viewport').outerHtml()
            template_variables["title"] = title
            template_variables["content"] = content
            template_variables["sku"] = sku
            self.render(self.template_path + "tmall.html", **template_variables)
    else:
        self.redirect(url)
def getTeamPageData(self, url):
    print 'url:%s' % url
    send_headers = {
        'Host': 'www.dszuqiu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive'
    }
    req = urllib2.Request(url, headers=send_headers)
    tmp_rsp = urllib2.urlopen(req).read()
    if tmp_rsp != '' and tmp_rsp is not None:  # the original 'or' was always true
        mpyq = pyq(tmp_rsp)
        sq_tables = mpyq('section.content.active table.live-list-table')  # there may be two tables: upcoming and finished
        target_tb = sq_tables.eq(-1)
        # trs = mpyq('section.content.active table.live-list-table tbody tr')  # error
        trs = target_tb.find('tbody tr')
        for i in range(trs.length):
            # e.g. a row like: 英冠 2016/10/29 22:00 [10] 利茲 v 伯顿阿尔比恩 [15] ( -0.5 / 2.0,2.5 / 10.5 ) - -- - -- - -- - -- 析
            tds = trs.eq(i).children()
            if tds.eq(10).text().find('-') == -1:
                tmplist = [tds.eq(j).text() for j in range(tds.length)]
                self.teamData.append(tmplist)
def getStat(mid):
    import gzip
    import StringIO
    url_stat = "http://odds.500.com/lq/stat.php?id=" + mid
    request = urllib2.Request(url_stat)
    request.add_header('Accept-encoding', 'gzip')
    opener = urllib2.build_opener()
    mFile = opener.open(request)
    isGzip = mFile.headers.get('Content-Encoding')
    if isGzip == 'gzip':
        compresseddata = mFile.read()
        compressedstream = StringIO.StringIO(compresseddata)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        stat_rsp = gzipper.read()
    else:
        stat_rsp = mFile.read()
    stpyq = pyq(stat_rsp)
    subSceList = []
    # the last two <tr> rows hold the per-quarter scores
    tr_a = stpyq('tr#bf_away')
    tr_h = stpyq('tr#bf_home')
    tds_a = tr_a.find('td')
    tds_h = tr_h.find('td')
    # skip the first and last <td> of each row
    for i in range(tds_a.length - 2):
        tmpSubSce = ' %s-%s' % (tds_a.eq(i + 1).text(), tds_h.eq(i + 1).text())
        subSceList.append(tmpSubSce)
        # print tmpSubSce
    return subSceList
def login():
    # get the login url
    jw2005_url = 'http://jw2005.scuteo.com/'
    response = request(url=jw2005_url)
    # e.g. http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default2.aspx
    array = response.geturl().split("/")
    array[4] = "default6.aspx"
    login_url = "/".join(array)
    # e.g. http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default6.aspx
    doc = pyq(url=login_url)
    viewstate = doc("#Form1 input")[0].value
    id = 123
    password = 123
    values = {
        '__VIEWSTATE': viewstate,
        # tname:
        # tbtns:
        'tnameXw': "yhdl",
        'tbtnsXw': "yhdl|xwxsdl",
        'txtYhm': id,
        # txtXm:
        'txtMm': password,
        "rblJs": "",
        "btnDl": "",  # .decode('gbk').encode('gbk')
    }
    headers = {}
    response, response_cookie = request(login_url, values, headers, True)
    return response.geturl()
def AnalyHtml(url, filePath):
    if filePath != '':
        pass
    else:
        htl = pyq(url=url)
        htl2 = htl('cc')
        for i in htl2:
            htl3 = pyq(i)
            htl4 = htl3('div')
            if htl4.find('img'):
                # print(htl4)
                print(htl4('img').attr('src'))
            else:
                # print(htl3('div').text())
                pass
def __init__(self, forumUrl, userName, password, proxy=None):
    '''
    Initialise the forum url, user name, password and proxy server.
    '''
    self.forumUrl = forumUrl
    self.userName = userName
    self.password = password
    self.formhash = ''
    self.isLogon = False
    self.isSign = False
    self.xq = ''
    self.postOldData = {}
    self.get_post_form_data = {}
    self.jar = cookielib.CookieJar()
    self.pids = []
    self.get_reply_content = [
        u"顶[s:53] [s:53] ", u"菱湖人顶个贴", u"[s:48] [s:48] [s:48] 顶顶",
        u"老菱湖人来顶顶帖子", u"混个脸熟[s:48] [s:48]", u"[s:89] [s:89] [s:89] [s:89] ",
        u"[s:77] [s:77] [s:77] 菱湖人路过", u"[s:53][s:53]顶[s:53]",
        u"顶顶顶[s:53][s:53]", u"[s:53]路过[s:53][s:53]", u"走走看看[s:53][s:53]",
        u"老菱湖人看看[s:53][s:53]", u"有没有菱湖的[s:53]"
    ]
    if not proxy:
        openner = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.jar))
    else:
        openner = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.jar),
            urllib2.ProxyHandler({'http': proxy}))
    urllib2.install_opener(openner)
    req = urllib2.Request(forumUrl + "/login.php?")
    content = urllib2.urlopen(req).read()
    doc = pyq(content)
    for item in doc("form").children("input").items():
        self.postOldData[item.attr("name")] = item.val()
def getPlayerInfo(playerid):
    html = getHtml('www.csgola.com', '/player/' + playerid)
    q = pyq(html)
    avatar = q('img.avatar.center-block.img-responsive').attr('src')
    playername = q('.personaname').text()
    statTit = q('.col-md-10 .title').text().encode('utf-8')
    statVal = q('.col-md-10 .datala').text().encode('utf-8')
    chartVal = q('.polar-detail .datala').text().encode('utf-8')
    json = {
        'error': 0,
        'playerinfo': {
            'avatar': avatar,
            'name': playername,
        },
        'stats': {
            'jishashu': statVal[0],
            'baotoulv': statVal[1],
            'kd': statVal[2],
            'shenglv': statVal[3],
            'zhengwangshu': statVal[4],
            'mingzhonglv': statVal[5],
            'juanzengwuqi': statVal[6],
            'mvpcishu': statVal[7],
        },
        'chart': {
            'zonghe': chartVal[0],
            'kd': chartVal[1],
            'mingzhonglv': chartVal[2],
            'baotoulv': chartVal[3],
            'shenglv': chartVal[4],
        },
    }
    return json
def movie_links_range(year, index):
    base_url = "https://movie.douban.com/tag"
    resource_url = "%s/%d?start=%d&type=T" % (base_url, year, index)
    rtree = pyq(url=resource_url)
    print resource_url
    items = rtree('.nbg')
    rst_list = []
    ##for idx in range(1):
    for idx in range(len(items)):
        link = items.eq(idx).attr('href')
        title = items.eq(idx).attr('title').encode("UTF-8")
        rst_list.append((link, title))
    ## actors_list = []
    ## items = rtree('.item')
    ## ##print items
    ## for idx in range(len(items)):
    ##     actors = items.eq(idx)('td')[1].find_class('pl')[0].text_content().encode("utf-8")
    ##     si = actors.rfind(") /")
    ##     if -1 != si:
    ##         actors = actors[si+3:]
    ##     actors_list.append(actors)
    movie_detail_list = []
    for item in rst_list:
        movie_detail = get_movie_detail_by_link(item[0])
        if movie_detail is not None:
            movie_detail["link_info"] = item
            movie_detail_list.append(movie_detail)
    return movie_detail_list
def kvartira(self, response):
    pyquery = pyq(url=response.url)
    dt = Handler.today(resp_date=response.doc('div.item_title').text().split(']')[0].replace('[', ''))
    number = re.compile(r'\w+.+?(\d+)')
    text_nomer = response.doc('.content span.date_update').text().replace('\t', '').replace('\n', '')
    number = number.findall(text_nomer)
    content = {name.text_content().encode('latin1').decode('utf8').replace("\t", '').replace("\n", ''):
                   pyquery("tr td.bold + td").eq(i).text().encode('latin1').decode('utf8')
               for i, name in enumerate(pyquery.find("td.bold"))}
    contact = {name.text_content().encode('latin1').decode('utf8').replace("\t", '').replace("\n", ''):
                   pyquery("tr td.caption + td").eq(i).text().encode('latin1').decode('utf8')
               for i, name in enumerate(pyquery.find("td.caption"))}
    content.update({
        "url": response.url,
        "Крошки": response.doc('div.content div.item_title span.bold').text(),
        "Дата парсинга": str(date.today()),
        "Обьявление": pyquery.find("div.content table tbody tr td[colspan]").text().encode('latin1').decode('utf8'),
        "Номер обьявления": number[0],
        "Дата публикации": dt,
        "Фото": Handler.url_photo(response.doc('div.thumb-carousel div.thumb a.image_thumb').items()),
        "Путь скриншота": selen.screen(response.url),
    })
    content.update(contact)
    return content
def get_url():
    __site = 'https://www.ptt.cc'
    __req_url = __site + '/bbs/Tech_Job/index.html'
    __idx = 1
    all_url = []
    while True:
        #print "this page1 = %s" % __req_url
        try:
            __response = urllib2.urlopen(__req_url, timeout=9999)
            __the_page = __response.read()
            doc = pyq(__the_page)
        except:
            continue
        doc.make_links_absolute(base_url=__site)
        for __i in doc('div.title a'):
            #print doc(__i).text()
            #print 'https://www.ptt.cc' + doc(__i).attr('href')
            all_url.append(doc(__i).attr('href'))
        __req_url = doc('.btn.wide').eq(1).attr('href')
        __idx += 1
        if __idx > 2:
            break
        if __req_url is None:
            break
    return all_url
def parse(self, response):
    l = ItemLoader(item=Problem(), response=response)
    d = pyq(response.body)
    l.add_value('id', response.url[-4:])
    l.add_value('title', d('#content_body > center:nth-child(1) > span').text())
    l.add_value('body', d('#content_body').text())
    return l.load_item()
def parsePost(self, response):
    def filterRule(url):
        if '/attachment/' in url:
            return url
    d = pyq(response.body)
    post = {
        "url": response.url,
        "title": response.css('#passage-title::text').extract()[0],
        "category": response.css('div.list-title-word::text').extract()[0],
        "datetime": response.css('#passage-info::text').extract()[0].split(' | ')[0],
        "hit": response.css('#passage-info::text').extract()[0].split(' | ')[1],
        "detail": d('#passage-detail').text(),
        "img": filter(filterRule, response.css('img::attr(src)').extract()),
    }
    self.collection.update({"url": post['url']}, post, True)
    '''
    the scheduler of yield here is different from that in tornado or twisted:
    it will call `next()` immediately, rather than waiting until the IO has
    completed, so just use yield -- it is still in parallel
    '''
    for url in post['img']:
        yield Request(url, callback=self.saveImage)
def translate():
    url = r'http://dict.youdao.com/search?q=' + sys.argv[1]
    doc = pyq(url)
    doc('#custheme').remove()
    doc('.c-topbar').remove()
    doc('.c-subtopbar').remove()
    doc('.c-header').remove()
    doc('#c_footer').remove()
    doc('.c-bsearch').remove()
    doc('#ads').remove()
    doc('#rel-search').remove()
    doc('.error-wrapper').remove()
    doc('#topImgAd').remove()
    doc('#container').css('margin', '0')
    doc('#container').css('width', '500px')
    doc('#results').css("margin-left", "20px")
    doc('#results-contents').css('width', '480px')
    doc('#results-contents').css('margin', '0')
    doc('#result_navigator').css('left', '380px')
    # result_navigator must not be removed, otherwise the definition tabs cannot be switched
    doc('#result_navigator').css('display', 'none')
    for a in doc('a'):
        href = a.get('href')
        if href is not None and href.startswith('/'):
            a.make_links_absolute('http://dict.youdao.com')
    link = u"<a href='" + url + u"'>在浏览器中查看翻译</a>"
    doc('#results-contents').append(link)
    print doc.outerHtml()
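# A minimal invocation sketch for translate() above (assuming sys and pyq are
# imported in the same module); the __main__ guard keeps the network call out of
# imports, and the usage line is an illustration, not part of the original code.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        translate()  # looks up sys.argv[1] on dict.youdao.com and prints the trimmed page
    else:
        print 'usage: python <this file> <word>'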
async def postemail(self, pe_url, mailform, getconfig, getauth):
    _LOGGER.info("Sending email form...")
    postemail = await self.execRequest({
        "url": pe_url,
        "params": mailform,
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.82",
            "origin": getconfig["issuer"],  # "issuer" from config
            "accept-language": "de-de",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
            "referer": getauth.url
        },
        "method": "POST"
    })
    _LOGGER.info("Done!")
    _LOGGER.info("Parsing email form response...")
    pqpe = pyq(postemail.text)
    pwform = dict([
        (t.attrib["name"], t.attrib["value"])
        for t in pqpe("#credentialsForm").find("[type='hidden']")
    ])
    pwform["password"] = self.config["password"]
    ppwurl = getconfig["issuer"] + pqpe("#credentialsForm")[0].attrib["action"]
    _LOGGER.info("Done!")
    await self.postpw(ppwurl, pwform, getconfig, postemail)
def __init__(self, dest):
    self.url = dest
    doc = pyq(url=self.url)
    records = doc('.list-img')
    for record in records:
        houseInfo = HouseInfo()
        titleLine = record.cssselect('.list-info-title')[0]
        houseInfo.title = self.getAttr(titleLine, 'title')
        houseInfo.link = '%s%s' % (r'http://bj.ganji.com', self.getAttr(titleLine, 'href'))
        district = record.cssselect('.list-word-col')[0]
        houseInfo.district = district.text_content()
        subway = record.cssselect('.list-word-col')[1]
        houseInfo.subway = subway.text_content()
        dist = record.cssselect('.list-word')[0]
        houseInfo.distance = dist.text_content().split('-')[-1].decode('utf-8')
        #(houseInfo.kind, houseInfo.area, houseInfo.decoration, houseInfo.floor, houseInfo.direction) = record.cssselect('.list-word')[1].text_content().split('/')
        props = record.cssselect('.list-word')[1].text_content().split('/')
        houseInfo.kind = props[0]
        houseInfo.area = props[1]
        houseInfo.decoration = props[2]  # the original assigned props[2] to kind a second time
        houseInfo.floor = props[3]
        houseInfo.direction = props[4]
        self.houses.append(houseInfo)
        #print(record.cssselect('.list-word')[1].text_content())
    for house in self.houses:
        print(house.title)
def parsePost(self, response):
    def analysys(response):
        try:
            d = pyq(response.css('div#artibody').extract()[0])
            data = {
                "url": response.url,
                "title": response.css('h1#artibodyTitle::text').extract()[0],
                "body": d.text(),
                "date": response.css('span#pub_date::text').extract()[0],
                "parsed": "1"
            }
            return data
        except IndexError, e:
            pass
        try:
            d = pyq(response.css('td[valign="top"]').extract()[2])
            data = {
                "url": response.url,
                "title": response.css('font[size="5"]::text').extract()[0],
                "body": d.text(),
                "date": response.css('font[face="Arial"]').extract()[0],
                "parsed": "2"
            }
            return data
        except IndexError, e:
            pass
def getNbySingle(self, teamid, teamname):
    url = 'http://liansai.500.com/team/' + str(teamid)
    print url
    headers = {
        'Host': 'liansai.500.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
        'Accept-Encoding': 'gzip,deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    rdata = None
    req = urllib2.Request(url, rdata, headers)
    rsp = urllib2.urlopen(req)
    if rsp.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(rsp.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    mypqy = pyq(data)
    mtbody = mypqy('table.lcur_race_list tbody')
    trs = mtbody.find('tr')
    singleRd_list = []
    for i in range(5):
        tds = trs.eq(i).find('td')
        singleRd_list.append([teamname.encode('gbk'), tds.eq(5).text().encode('gbk')])
        print '%s %s' % (teamname, tds.eq(5).text())
    return singleRd_list
def parse(self, url):
    res = requests.get(url)
    assert res.status_code == 200
    jq = pyq(res.content)
    self.url = url
    self.price = jq('.PriceContainer').text()
    self.color = jq('.colorLabel').text()
    self.name = jq('.productInfo>h1').text()
    category_id = re.findall('/(\d+)-', url)
    self.category_id = category_id[0] if category_id else ''
    images = jq('.productSlideshow>ul>li>div>img')
    image_list = []
    for r in images:
        image_url = r.get('src')
        if not image_url:
            continue
        image_list.append('%s%s' % (CosstoresGoodsPrase.COSSTORES_HOST, image_url))
    self.image = image_list
    first_image = image_list[0] if image_list else ''
    goods_id = re.findall('/(\d+)/', first_image)
    self.goods_id = str(goods_id[0]) if goods_id else ''
    # fetch the variant data via the AJAX endpoint
    goods_detail_ids = jq('.productSizes>label>input')
    goods_detail_id = goods_detail_ids[0].get('value') if goods_detail_ids else ''
    if goods_detail_id:
        goods_detail_url = 'http://www.cosstores.com/gb/product/GetVariantData?variantId=%s&lookID=null&image=0' % (goods_detail_id)
        res = requests.get(goods_detail_url)
        assert res.status_code == 200
        result = res.json()
        self.code = result.get('HMOrderNo', '')
        self.original_price = result.get('DefaultPriceWithCurrency', '')
        self.price = result.get('PriceWithCurrency', '')
        self.attributes = result.get('Attributes', [])
        self.details = result.get('DescriptionShort', '')
def getMatchid_cl(self, lgid):
    mypyq = pyq(self.rsp)
    trs_euro = mypyq('table.lcur_race_list tbody tr')
    for i in range(trs_euro.length):
        tds = trs_euro.eq(i).find('td')
        matid = tds.eq(6).find('a').attr('href').split('-')[-1].split('.')[0]
        self.mmatchid_list.append(matid)
def request_weixin(query, qtype=1):
    url = 'http://weixin.sogou.com/weixin?type=%d&query=%s'
    doc = pyq(url=(url % (qtype, query)))
    weixin_list = doc(".results>div").items()
    for item in weixin_list:
        openid = item.attr['href'][12:]
        name = item(".txt-box>h3").text()
        weixin_num = item(".txt-box>h4>span").text()[4:]
        print(name + ": " + weixin_num + " " + openid)
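# A small usage sketch for request_weixin() above; the query string is an
# arbitrary example, not taken from the original code.
if __name__ == '__main__':
    request_weixin('python', qtype=1)  # prints "name: weixin_num openid" for each result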
def get_proxy_list(self):
    r = requests.get(gPROXY, timeout=5)
    doc = pyq(r.content)
    tr_list = [i.text() for i in doc('#proxylisttable').find('tr').items()]
    tr_list = [i for i in tr_list[1:] if i]
    proxy_list = [i.split('\n')[:2] for i in tr_list]  # split each row on newlines ('/n' in the original was a typo)
    proxy_list = [':'.join(i) for i in proxy_list]
    proxy_list = [{'http': i, 'https': i} for i in proxy_list]
    return proxy_list
def dir_parse(self, page, spider_list, result_list):
    #print page
    doc = pyq(page)
    tmp = doc('div[class=article]')
    tl = tmp('tr[class=item]')
    #print tl
    for tr in tl:
        dl = pyq(tr)('div[class=pl2]')
        #print dl
        a = dl('a')
        print a.attr('href')
        result_list.insert(0, a.attr('href'))
    next = doc('span[class=next]')
    a = next('a').attr('href')
    if a is not None and len(a) > 5:
        print a.encode("UTF-8")
        spider_list.append(a.encode("UTF-8"))
    return
def format_html(self, html):
    if html is None or html == '':  # the original 'and' could never be true
        return ''
    try:
        doc = pyq(html)
        text = doc.text()
        result = ' '.join(text.replace('\n', ' ').split())
    except Exception, e:
        result = html
    return result  # the original stopped after the except block, returning None
def get_citys(self, fid=0):
    url = self._url.format(fid)
    doc = pyq(url=url)
    text = doc.text()[21:-1]
    try:
        return json.loads(text)
    except:
        print text
        return []
def getPreviousSingle(self, matchid):
    url = 'http://odds.500.com/fenxi/shuju-' + str(matchid) + '.shtml'
    # url = 'http://live.500.com/'
    print url
    # rsp = urllib.urlopen(url).read()
    headers = {
        'Host': 'odds.500.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
        'Accept-Encoding': 'gzip,deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    rdata = None
    req = urllib2.Request(url, rdata, headers)
    rsp = urllib2.urlopen(req)
    if rsp.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(rsp.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    # print "rsp:%s" % data
    mypyq = pyq(data)
    trs = mypyq('body div#team_jiaozhan tr[fid]:gt(0)')
    singleRd_list = []
    print trs
    # start from index = 2
    if trs.length > 3:
        mlen = 3
    else:
        mlen = trs.length
    for i in range(mlen):
        tds = trs.eq(i).find('td')
        singleRd_list.append([
            ''.join(tds.eq(1).text().split('-')),
            re.sub('\[\d+\]\s', '', tds.eq(2).find('span.dz-l').text()).encode('gbk'),
            ' ' + ''.join(tds.eq(2).find('em').text().split(' ')),
            re.sub('\s\[\d+\]', '', tds.eq(2).find('span.dz-r').text()).encode('gbk')
        ])
        print '%s %s %s %s' % (''.join(tds.eq(1).text().split('-')),
                               re.sub('\[\d+\]\s', '', tds.eq(2).find('span.dz-l').text()),
                               ''.join(tds.eq(2).find('em').text().split(' ')),
                               re.sub('\s\[\d+\]', '', tds.eq(2).find('span.dz-r').text()))
    return singleRd_list
def getname(headers):
    '''
    Walk the post-count ranking to collect user IDs, then fetch each user's
    birthday and QQ number and write them to a text file.
    '''
    url = "http://www.cc98.org/toplist.asp?orders=1&page="
    file = open('id.txt', 'w+')
    links = [url + "%d" % i for i in xrange(100, 150)]
    for link in links:
        response, content = http.request(link, 'GET', headers=headers)
        soup = BeautifulSoup(content)
        get = soup.findAll("td", {"class": "tablebody1"})
        for i in xrange(0, 100, 5):
            userid = get[i]
            endurl = userid.find("a")['href']
            idurl = "http://www.cc98.org/" + endurl
            idresponse, idcontent = http.request(idurl, 'GET', headers=headers)
            idcontent = BeautifulSoup(idcontent)
            name = get[i].find("a").string
            name = pyq(name).text()
            # write the user id
            file.write(name)
            bir = idcontent.findAll("td", {"class": "tablebody1"}, {"style": "line-height:150%"})[-1]
            for br in bir.findAll('br'):
                next = br.nextSibling
                beg = next.find("生 日: ")
                if beg == 0:
                    pyear = next.find(" 年")
                    pmon = next.find(" 月")
                    pday = next[pmon:].find(" 日")
                    year = next[beg + 5:pyear]
                    mon = next[pyear + 2:pmon]
                    day = next[pmon + 2:pday + pmon]
                    if year != '':
                        # print "year = " + year
                        # print "mon = " + mon
                        # print "day = " + day
                        # write the birthday
                        file.write(" : " + year + mon + day)
                beg = next.find("QQ : ")
                if beg == 0:
                    qq = next[beg + 5:]
                    # print qq
                    # write the QQ number
                    file.write(" : " + qq + '\n')
    file.close()