Example No. 1
    def dir_parse(self, page, spider_list, result_list):
        print page
        doc = pyq(page)
        tmp = doc('table[class=tableList]')
        trl = tmp('tr')
        for v in trl:
            td = pyq(v)('td[class=title]')
            a = td('a')
            name = a.text().encode("UTF-8").decode("UTF-8")
            ename = ""
            print name
            if len(name) > 1:
                for uchar in name:
                    #print uchar
                    if is_alphabet(uchar):
                        ename += uchar
                    #elif uchar =='.' or uchar ==' ' or uchar =='&':
                    #ename += uchar
                    elif (uchar == '(' or is_number(uchar)) and len(ename) > 2:
                        break
                print "xxxx", ename

                link = "http://banyungong.net/" + a.attr('href')
                result_list.append(
                    (ename.lower() + "," + link).encode("UTF-8"))

        return ""
Example No. 2
def parse_json(jso):
    if jso is not None:  # make sure the JSON payload was actually fetched
        items = jso.get('data').get('cards')
        for item in items:
            a = item.get('mblog')
            if a is None:  # skip empty cards: they carry no post, so .get() would fail on them
                continue
            elif a['isLongText']:  # long posts: the full text has to be fetched separately
                p = all(a['id'])  # 'all' is assumed to be a helper defined elsewhere in the module that fetches the long-text JSON for this post id
                if p.get('data') is None:
                    continue
                else:
                    b = {
                        'text': pyq(p.get('data')['longTextContent']).text(),
                        'date': a['created_at']
                    }  # pyq() strips the HTML markup from the post body
                    with open('text.txt', 'a', encoding='utf-8') as f:
                        f.write(str(b))
                        f.write('\n')
                    #print(b)
            elif not a['isLongText']:
                b = {'text': pyq(a['text']).text(), 'date': a['created_at']}
                with open('text.txt', 'a', encoding='utf-8') as f:
                    f.write(str(b))
                    f.write('\n')
                #print(b)
    else:
        print("用户不存在或已注销...")  # "user does not exist or has been deactivated"
Example No. 3
	def dir_parse(self,page,spider_list,result_list):
		print page
		doc = pyq(page)
		tmp = doc('table[class=tableList]')
		trl = tmp('tr')
		for v in trl:
			td= pyq(v)('td[class=title]')
			a = td('a')
			name =  a.text().encode("UTF-8").decode("UTF-8")
			ename =""
			print name
			if len(name)>1:	
				for uchar in name:
					#print uchar
					if  is_alphabet(uchar) :
						ename += uchar
					#elif uchar =='.' or uchar ==' ' or uchar =='&':
						#ename += uchar
					elif (uchar =='(' or is_number(uchar) ) and len(ename)>2:
						break
				print "xxxx",ename

				link =  "http://banyungong.net/" + a.attr('href')
				result_list.append((ename.lower() +"," +link).encode("UTF-8"))
		

		return ""
Example No. 4
 def get_posts(self, fid, multi=False, index=2, size=100):
     if multi:
         for index in range(index, size, 1):
             time.sleep(random.randint(20, 60))
             url = self.forumUrl + "thread-htm-fid-{fid}-page-{page}.html".format(
                 fid=fid, page=index)
             req = urllib2.Request(url)
             content = urllib2.urlopen(req).read().decode('gbk')
             doc = pyq(content)
             data = doc("#threadlist").children(".nthtr3").items()
             for item in data:
                 pid = item.children(".subject").attr("id").split("td_")[1]
                 print "当前的帖子id: %s" % pid
                 self.pids.append(pid)
                 time.sleep(random.randint(30, 200))
                 self.get_post(pid)
                 time.sleep(random.randint(20, 200))
                 self.reply(pid)
     else:
         time.sleep(random.randint(20, 60))
         url = self.forumUrl + "thread-htm-fid-{fid}.html".format(fid=fid)
         req = urllib2.Request(url)
         content = urllib2.urlopen(req).read().decode('gbk')
         doc = pyq(content)
         for item in doc("#threadlist").children(".nthtr3").items():
             pid = item.children(".subject").attr("id").split("td_")[1]
             print "当前的帖子id: %s" % pid
             self.pids.append(pid)
             time.sleep(random.randint(30, 200))
             self.get_posts(pid)
             time.sleep(random.randint(20, 300))
             self.reply(pid)
Example No. 5
	def run(self):
		headers = {'connection': 'close'}
		response = requests.get(self.url, headers=headers)
		response.encoding = 'utf-8'
		column_jq = pyq(response.text)
		column = column_jq('title').text()

		parsed_body = html.fromstring(response.text)
		song_urls = parsed_body.xpath('//a[contains(@href, "/play/")]/@href')
		new_lyrics = []

		for song_url in song_urls:
			full_url = urlparse.urljoin("http://www.9ku.com", song_url)   # base_url ahead
			r = requests.get(full_url, headers=headers)
			r.encoding = 'utf-8'   # refer to test/get_chinese.py
			jq = pyq(r.text)
			# get title, author in song page
			brief = jq('h2#play_title').text()
			title = brief.split(' ')[1]
			author = brief.split(' ')[3]
			# two types of song pages
			if jq('div.lrcBox').text():
				content = jq('div.lrcBox').text()
			else:
				out_url = jq('h2#play_title').find('a').eq(2).attr('href')
				r_out = requests.get(out_url, headers=headers)
				r_out.encoding = 'utf-8'   # may not be needed
				jq_out = pyq(r_out.text)
				content = jq_out('div.ciInfo').eq(0).text()

			new_lyric = Lyric2(column=column, title=title, author=author,
				content=content)
			new_lyric.save()
		
			print 'get data from %s at %s' % (full_url, time.ctime())
Example No. 6
	def getPages(self):
		dirs=[]
		#doc=pyq(self.url)
		#while doc("div:contains('Browse Problems')+div+table img").attr('alt')=="FOLDER" and (None in dirs[p].values()):
			#dirs[p].update(dict.fromkeys([self.root+'/'+a.attr('href') for a in doc("div:contains('Browse Problems')+div+table a")]))
			#for d,c in dirs[p].items():
		dirs.append(self.url)
		while dirs:
			curdir=dirs.pop()
			try:
				doc=pyq(curdir)
			except (httplib.IncompleteRead,urllib2.URLError):
				print "Bug!!!!!!!!!!!!!1"
				httplib.HTTPConnection._http_vsn = 10
				httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
				doc=pyq(curdir)
				#httplib.HTTPConnection._http_vsn = 11
				#httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
			if doc("div:contains('Browse Problems')+div+table img").attr('alt')=="FOLDER":
				print "[folder]",curdir
				links=doc("div:contains('Browse Problems')+div+table a")
				for a in links:
					dirs.append(self.root+'/'+pyq(a).attr('href'))
			else:
				print "[file]",curdir
				self.pages.append(curdir)
Example No. 7
    def _fetch_query(self, url, page=0):
        print "-" * 10, " Fetch Page %s " % (page + 1), "-" * 10
        print url

        try:
            html = urllib2.urlopen(url).read()
        except urllib2.HTTPError as e:
            if e.code == 429:
                print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
                eventlet.sleep(self._too_many_request_sleep)
                return self._fetch_query(url, page)
            raise e
        jq = pyq(html)

        urls = []

        user_list = jq(".user-list-item")
        for i in user_list:
            name = pyq(i).find(".user-list-info a")
            href = self._domain + name.attr("href")

            urls.append(href)

        users = []
        for user in pool.imap(self._fetch_user, urls):
            users.append(user)

        if page == 0:
            max_page_index = jq(".next_page").prev("a").text()
            users.extend(self._fetch_query_by_page(url, int(max_page_index)))

        return users
Example No. 8
def main():
    doc = pyq(filename='html.txt')
    doc1 = doc('div')
    doc2 = doc1('a')
    # print(doc2)
    TieBaDate = {}

    try:
        f = open('source.txt', 'w')
    except IOError:
        print("Error: open file failed.")
    iSum = 0
    for i in doc2:
        tmphref = pyq(i).attr('href')
        tmptitle = pyq(i).attr('title')
        strhref = repr(tmphref)
        strtitle = repr(tmptitle)
        aryhref = re.findall(r'/p/(\d+)', strhref)

        # keep only thread links whose title contains the keyword "魔枪"
        if aryhref != [] and re.findall('(.*?)魔枪(.*?)', strtitle) != []:
            # print(strtitle)
            # print(strhref)
            strsource = 'http://tieba.baidu.com/p/%s' % aryhref[0]
            f.write(strsource)
            f.write("\n")
            iSum += 1
            AnalyHtml(url=strsource, filePath='')
            break

    print('sum :', iSum)
    f.close()
Example No. 9
    def _fetch_user(self, url):
        try:
            html = urllib2.urlopen(url + "?tab=repositories").read()
        except urllib2.HTTPError as e:
            if e.code == 429:
                print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
                eventlet.sleep(self._too_many_request_sleep)
                return self._fetch_user(url)
            raise e
        jq = pyq(html)

        data = {}
        data['url'] = url
        data['name'] = jq(".vcard-fullname").text()
        data['avatar'] = jq(".avatar").attr("src")
        data['location'] = jq("[aria-label='Home location']").attr("title")
        data['email'] = jq("[aria-label=Email] a").text()
        data['website'] = jq("[aria-label='Blog or website'] a").text()
        data['join'] = jq(".join-date").attr("datetime")
        data['followers'] = jq(".vcard-stat-count:eq(0)").text()
        data['starred'] = jq(".vcard-stat-count:eq(1)").text()
        data['following'] = jq(".vcard-stat-count:eq(2)").text()

        data['repositories'] = {}
        sources = jq(".repo-list-item.source")
        data['repositories']['source_count'] = len(sources)
        data['repositories']["source_lang"] = {}
        for i in sources:
            lang = pyq(i).find("[itemprop='programmingLanguage']").text()
            data['repositories']["source_lang"].setdefault(lang, 0)
            data['repositories']["source_lang"][lang] += 1

        return data
Example No. 10
def get_jiandan_mm_pic(page_num):
    url = 'http://jandan.net/ooxx/page-' + str(page_num)
    html = pyq(url)
    print('reading ...  http://jandan.net/ooxx/page-{0}\n'.format(page_num))
    sys.stdout.flush()
    #print(html)

    hash_pic_message = {}
    # collect the picture URLs
    for element in html('li div div.row div.text'):
        img = pyq(element).find('img')
        #img = pyq(element)('img')
        if img:  # only rows that actually contain an <img>
            id = pyq(element)('span a').text()
            #id = id.replace("vote-","")
            hash_pic_message[id]={}
            hash_pic_message[id]['ID']=id
            hash_pic_message[id]['URL']=[]
            hash_pic_message[id]['FileName']=[]

            if img.attr('org_src') == None:
                for t in img:
                    url = img(t).attr('src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
            else:
                for t in img:
                    url = img(t).attr('org_src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))

    # collect the picture IDs and the vote-based rating
    for element in html('li div div.row div.jandan-vote'):
        id = pyq(element)('a').attr('data-id')
        #id = id.replace("vote-","")

        vote = pyq(element).text()

        reg_vote = r'OO \[ (\d.*) \] XX \[ (\d.*) \]'
        pattern = re.compile(reg_vote)
        result = pattern.findall(vote)
        if result:  # findall returns a list, so check for emptiness rather than None
            support = result[0][0]
            unsupport = result[0][1]
            hash_pic_message[id]["Support"] = support
            hash_pic_message[id]["UnSupport"] = unsupport

            if unsupport != "0":
                scale = float(support) / float(unsupport)
            else:
                scale = 0.0
            rank = get_scale(scale)
            hash_pic_message[id]["Scale"] = scale
            hash_pic_message[id]["Rank"] = rank


    for value in hash_pic_message.values():
        #print(value)
        pass
    return hash_pic_message.values()
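A hedged follow-up sketch: walk the metadata returned above and download every picture; the output directory is a placeholder, and get_file_name2/get_scale are helpers from the original module:

import requests

def download_page(page_num, out_dir='.'):
    # each item carries parallel lists of picture URLs and generated file names
    for item in get_jiandan_mm_pic(page_num):
        for url, name in zip(item['URL'], item['FileName']):
            with open('%s/%s' % (out_dir, name), 'wb') as f:
                f.write(requests.get(url, timeout=30).content)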
Example No. 11
def getHtmlByPyquery(tUrl):
    posts =[]
    from pyquery import PyQuery as pyq
    r = requests.get(tUrl)
    doc=pyq(r.text)
    lis = doc(".car-monthlisting li a")
    lis = lis[0:100]
    lis.reverse()
    i=1
    for li in lis:
        link = pyq(li).attr("href")
        title =  pyq(li).text()
        print "抓取文章_%s(%s,link:%s)" %(i,title,link)
        ir = requests.get(link)
        idoc = pyq(ir.text)
        content = idoc("#content .entrybody").remove(".wumii-hook").remove("script").remove("ins").remove(".ds-thread").remove("#ds-ssr").remove("div").remove("#comments").html()
        content = content.replace("\"","\"\"");
        #print content
        post = Post()
        post.category = urllib.quote("notes") + ":段子"
        post.post_author = "geekzone"
        post.post_title = title
        post.post_content = "\""+content+"\""
        posts.append(post)
        i=i+1
    return posts
Example No. 12
def exportText(section, idx, link):
    #    url = "http://book.kanunu.org/book3/6630/115916.html"
    #    req = urllib2.Request(url)
    #    response = urllib2.urlopen(req).read()
    fileName = section + "/" + idx + ".html"
    textFile = open(fileName)
    mainHtml = textFile.read()
    textFile.close()
    html = unicode(mainHtml, "GBK")
    doc = pyq(html)
    tables = doc.find("table")
    a = []
    for table in tables:
        a.append(len(pyq(table).text()))
    mx = max(a)
    textIdx = a.index(mx)
    titleIdx = textIdx - 1
    mainText = pyq(tables[textIdx]).find("p").html()
    #    afterTitle = mainText.index(r"<br/>")
    #    mainTitle = mainText[0:afterTitle].replace(u" ", "").replace(u"】", "").replace(u"【", "").strip().encode("UTF-8")
    #    mainTitle = pyq(tables[titleIdx]).text().replace(u"上部 ", "").replace(u"中部 ", "").replace(u"下部 ", "").encode("UTF-8")
    mainTitle = pyq(tables[titleIdx]).text().encode("UTF-8")
    outFile = open("Text/" + section + "/" + idx + ".xhtml", "w")
    outFile.write("<h1>" + mainTitle + "<h1/>")
    # outFile.write("<p>")
    outFile.write(mainText.encode("UTF-8"))
    # outFile.write("<p/>")
    outFile.write("<p><br/>" + link + "<br/><p/>")
    outFile.close()
    titleList.append(mainTitle)
    return mainTitle
Example No. 13
def getSrc(url):
    text = getInfo(url)
    doc = pyq(text)
    cts = doc('.thumb_mov_model').find('a')
    for i in cts:
        link = pyq(i).attr('href')
        src = pyq(getInfo(link))('#example_video_1').find('source').attr('src')
        yield src
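A hedged usage sketch: iterate the generator above and print each extracted video source; getInfo is assumed (as in the original) to return the HTML of a page, and the listing URL is a placeholder:

if __name__ == '__main__':
    listing_url = 'http://example.com/movies'  # placeholder listing page
    for src in getSrc(listing_url):
        print(src)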
Example No. 14
def main():
    url = 'http://taiwan.huanqiu.com/news/'
    #url = 'http://world.huanqiu.com/observation/'
    #url = 'http://china.huanqiu.com/politics/'
    doc = pyq(url=url)
    alist = doc('.pad20 li a')
    for a in alist:
        link = pyq(a).attr('href')
        get_info(link)
Example No. 15
 def get_pdf_url(self, url_list):
     pdf_list =[]
     print(url_list)
     for i in url_list:  # 進去每個報告的網頁,找到pdf的連結網址
         r = requests.get(i)
         doc = pyq(r.content.decode('gbk'))
         pdf = doc('a').filter(lambda j, this: 'PDF' in pyq(this).text()).eq(0)
         pdf_list.append(pdf.attr['href'])
     return pdf_list
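A hedged follow-up sketch: download every PDF link returned by get_pdf_url(); the sequential file-naming scheme and the output directory are assumptions:

import requests

def download_pdfs(pdf_list, out_dir='.'):
    # save each report under a sequential file name
    for n, link in enumerate(pdf_list):
        data = requests.get(link, timeout=30).content
        with open('%s/report_%d.pdf' % (out_dir, n), 'wb') as f:
            f.write(data)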
Example No. 17
    def get_proxy_list(self, page_range=15):
        __all_proxy_list =[]
        for __page in range(page_range):
            __url = 'http://proxylist.hidemyass.com/%s#listable' % __page
            __request = urllib2.Request(__url, headers=self.__headers)
            __response = urllib2.urlopen(__request)
            __the_page = __response.read()
            doc = pyq(__the_page)

            for __list_idx in doc('#listable tbody>tr')[:]:
                __tmp = doc(__list_idx).outerHtml()
                p = pyq(__tmp)
                for __j in p('style').text().split('\n'):
                    if __j.find('display:none') > 0:
                        p.remove(__j.split('{')[0])

                p.remove('style')

                for __j in p('span,div'):
                    if p(__j).attr('style') == 'display:none':
                        p(__j).remove()

                __proxy = {'last_update' : p('td').eq(0).text(),
                           'ip_address' : p('td').eq(1).text().replace(' ',''),
                           'port' : p('td').eq(2).text(),
                           'country' : p('td').eq(3).text(),
                           'countryIsoCode' : p('td').eq(3).attr('rel'),
                           'type': p('td').eq(6).text(),
                           'anon' : p('td').eq(7).text(),
                           'speed': ''.join( re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]) ),
                           'connection_time': ''.join( re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]) )
                           }
                print __proxy
                __all_proxy_list.append(__proxy)

        pickle.dump(__all_proxy_list, open('free_proxy_list', 'wb'))
        __all_proxy_list = pickle.load(open('free_proxy_list', 'rb'))
        return __all_proxy_list

        all_count_cnt = {}
        for __i in __all_proxy_list:
            if all_count_cnt.has_key(__i['country']):
                all_count_cnt[__i['country']] = all_count_cnt[__i['country']]+1
            else:
                all_count_cnt[__i['country']] = 1

        print all_count_cnt

        all_count_cnt = {}
        for __i in __all_proxy_list:
            if all_count_cnt.has_key(__i['countryIsoCode']):
                all_count_cnt[__i['countryIsoCode']] = all_count_cnt[__i['countryIsoCode']]+1
            else:
                all_count_cnt[__i['countryIsoCode']] = 1

        print all_count_cnt
Example No. 18
    def _parse_data(self, pyq_node, k, data, debug):
        keymap = []
        path = data['path']
        pathlist = path.split(',')
        node = pyq_node
        for p in pathlist:
            if 'attr@' in p:
                attr = p[5:]
                value = node.attr(attr)
                return value
            elif 'text' == p:
                if node.text() != None:
                    value = node.text().encode("utf-8")
                else:
                    value = None
                return value
            elif '#' in p:
                pl = p.split('#')
                #print pl[0],pl[1]
                node = node(pl[0].encode("utf-8")).eq(int(pl[1]))
                if node != None:
                    node = pyq(node)
                else:
                    return None
            else:

                node = node(p.encode("utf-8"))
                if node != None:
                    #node = pyq(node)(p)
                    node = pyq(node)
                else:
                    return None
            if debug:
                print "DEBUG,p", p
                print node


#        for key in data:
#            if key != 'path':
#                keymap[k]=[]
#                break;
        if len(node) > 0:
            if debug:
                print "DEBUG", k
                print node
            for d in node:

                submap = {}
                for key in data:
                    if key != 'path':
                        res = self._parse_data(pyq(d), key, data[key], debug)
                        submap[key] = res
                keymap.append(submap)

        return keymap
Example No. 19
    def _parse_data(self,pyq_node,k,data,debug):
        keymap =[]
        path = data['path']
        pathlist = path.split(',')
        node = pyq_node
        for p in pathlist:
            if 'attr@' in p:
                attr = p[5:]
                value= node.attr(attr) 
                return value
            elif 'text' == p:
                if node.text() != None:
                    value = node.text().encode("utf-8")
                else:
                    value = None
                return value
            elif '#' in p:
                pl = p.split('#')    
                #print pl[0],pl[1]
                node = node(pl[0].encode("utf-8")).eq(int(pl[1]))
                if node !=None:
                    node = pyq(node)
                else:
                    return None
            else:
                
                node = node(p.encode("utf-8"))
                if node!=None:
                #node = pyq(node)(p)
                    node = pyq(node)
                else:
                    return None
            if debug:
                print "DEBUG,p",p
                print node

        
#        for key in data:
#            if key != 'path':
#                keymap[k]=[]
#                break;
        if len(node )> 0: 
            if debug:
                print "DEBUG",k
                print node
            for d in node:
                
                submap ={}
                for key in data:
                    if key != 'path':
                        res = self._parse_data(pyq(d),key,data[key],debug)
                        submap[key] = res
                keymap.append(submap)
            
        return keymap
Example No. 20
 def getRound(self):
     homepyq = pyq(self.rsp)
     uri = homepyq('ul.lpage_race_nav.clearfix').children().eq(1).find(
         'a').attr('href')
     url = self.pagehost + uri
     rsp = urllib.urlopen(url).read()
     jfpyq = pyq(rsp)
     countRnd = jfpyq('ul.lsaiguo_round_list.clearfix').children().length
     self.rnd = jfpyq('ul.lsaiguo_round_list.clearfix').children(
         'li.on').find('a').text()
     return countRnd
Example No. 21
    def parse(self, url):
        # parse the first page of the goods list
        res = requests.get(url)
        assert res.status_code == 200
        jq = pyq(res.content)

        goods_list = jq('.list-container>ul>li>a')
        for r in goods_list:
            goods_url = r.get('href')
            if not goods_url:
                continue
            goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
            goods_name = r.get('title')
            #  print goods_url, goods_name

            goods_item = {
                'url' : goods_url,
                'name' : goods_name,
            }
            self.goods_list.append(goods_item)

        # parse the AJAX-loaded goods list pages (pages 2..n)
        next_page = jq('#infiload_nav>a')
        if next_page:
            next_page = next_page[0]
            max_page = int(next_page.get('data-maxpage'))
            next_url = next_page.get('href')
            np = re.findall('page=(\d+)', next_url)
            if not np:
                return
            np = int(np[0])
            while np <= max_page:
                next_url = re.sub('page=(\d+)', 'page=%s' % (np), next_url)
                np += 1
                res = requests.get('%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, next_url))
                assert res.status_code == 200
                jq_page = pyq(res.content)
                goods_list = jq_page('li>a')
                if not goods_list:
                    # nothing left to parse
                    break
                for r in goods_list:
                    goods_url = r.get('href')
                    if not goods_url:
                        continue
                    goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
                    goods_name = r.get('title')
                    goods_item = {
                        'url' : goods_url,
                        'name' : goods_name,
                    }
                    self.goods_list.append(goods_item)
Example No. 22
def getFunctions(url):
    apihost = 'file:///opt/Aldebaran Robotics/Choregraphe Suite 2.1/share/doc/naoqi/'
    if url == '#':
        return
    url = apihost + url
    doc = pyq(urllib2.urlopen(url).read())
    for nodefunction in doc('.function'):
        func = pyq(pyq(nodefunction).children()[0])
        funcName = func('.descname').text()
        module = func('.descclassname').text().split('::')[0].split('Proxy')[0]
        params = []
        for param in func('em'):
            params.append(pyq(param).text())
        if not codes.has_key(module):
            codes[module] = ''
            codes[module] += 'from naoqi import ALProxy\n'
            codes[module] += 'from network.const import PORT\n'
            codes[module] += 'from lib.cmd_parser import Cmd\n'
            codes[module] += 'from lib.elementParser import parse\n\n'
            codes[
                module] += 'proxy = ALProxy(\'' + module + '\', \'127.0.0.1\', PORT)\n\n'
            codes[module] += 'def proceed(cmd):\n'
            codes[module] += '\tcmd = Cmd(cmd)\n'
            codes[module] += '\tcmd.removeHead()\n'
            codes[
                module] += '\tprint \'going to function:\', cmd.getCommand()\n'
            codes[module] += '\tfunc = globals().get(cmd.getCommand())\n'
            codes[module] += '\tif func:\n'
            codes[module] += '\t\treturn func(cmd.getValues(\'p\'))\n'
            codes[module] += '\telse:\n'
            codes[
                module] += '\t\tprint \'Error: Cannot find command:\' + cmd.getCommand()\n'
            codes[
                module] += '\t\treturn \'Error: Cannot find command:\' + cmd.getCommand()\n\n'
        codes[module] += 'def ' + funcName + '(params):\n'
        if params:
            codes[module] += '\tif len(params) < ' + str(len(params)) + ':\n'
            codes[
                module] += '\t\tprint \'Error: function \\\'' + funcName + '\\\' takes 2 params\'\n'
            codes[
                module] += '\t\treturn \'Error: function \\\'' + funcName + '\\\' takes 2 params\'\n'
        for i in range(len(params)):
            codes[module] += '\t' + params[i] + ' = parse(params[' + str(
                i) + '])\n'
        codes[module] += '\treturn proxy.' + funcName + '('
        if params:
            codes[module] += params[0]
            for i in range(1, len(params)):
                codes[module] += ',' + params[i]
        codes[module] += ')\n\n'
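A hedged follow-up sketch: write the generated wrapper code (accumulated in the module-level codes dict, which the snippet above references but does not define) out to one file per module; the output directory name is a placeholder:

import os

def dump_generated_modules(out_dir='generated'):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for module, source in codes.items():
        with open(os.path.join(out_dir, module + '.py'), 'w') as f:
            f.write(source)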
Example No. 23
    def get_betting_odds_info_list(self):
        h = requests.get(self.url, timeout = self.timeout) #, proxies = self.proxies
        text = h.content
        pq = pyq(text)
        betting_odds_info_list = []
        startdate_html = pq('.event-holder.holder-scheduled>.eventLine.status-scheduled')
        url_html = pyq(startdate_html)('meta[itemprop=\'url\']')
        matchup_html = pyq(startdate_html)('meta[itemprop=\'name\']')
        for i in range(len(startdate_html)):
            betting_odds_info_list.append({'start_time': startdate_html.eq(i).attr('rel'),
                                       'url': url_html.eq(i).attr('content'),
                                        'away_team': matchup_html.eq(i).attr('content').split(' vs ')[0],
                                       'home_team': matchup_html.eq(i).attr('content').split(' vs ')[1]})

        return betting_odds_info_list
Example No. 24
def get_page_book_info(url, book):
    html = pyq(url)
    next_link = None

    print('reading ...  {0}\n'.format(url))
    sys.stdout.flush()

    # collect the book information
    for element in html('ul.list li.o'):
        o_img = pyq(element)('div.o-img')
        o_info = pyq(element)('div.o-info')

        link = o_img('a').attr('href')
        img_src = o_img('img').attr('src')
        o_name = pyq(element)('h3.o-name a').text()
        o_author = pyq(element)('p.o-author a').text()
        o_ext = pyq(element)('p.o-ext').text()
        o_cate = pyq(element)('p.o-cate a').text()
        o_data = pyq(element)('p.o-data i.icon').text()
        t_temp = o_data.split(" ")
        if t_temp != None:
            o_click = t_temp[0]
            o_download = t_temp[1]
        print(o_name, o_author, link, img_src, o_ext, o_cate, o_click,
              o_download)
        sys.stdout.flush()

        index = len(book) + 1
        book[index] = {}
        book[index]["Index"] = index
        book[index]["Name"] = o_name
        book[index]["Author"] = o_author
        book[index]["Tag"] = o_cate
        book[index]["EXT"] = o_ext
        book[index]["Link"] = link
        book[index]["Picture"] = img_src
        book[index]["Click_Number"] = o_click
        book[index]["Download_Number"] = o_download

    # find the link to the next page
    for link in html('ul.paging li a'):
        if pyq(link).text() == '下一页':  # '下一页' = "next page"
            next_link = pyq(link).attr('href')

    # next_link stays None when there is no further page
    return book, next_link
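A hedged driver sketch for the pager above, assuming the href returned for the next page is an absolute URL that pyq() can fetch:

def crawl_all(start_url):
    book = {}
    url = start_url
    while url:
        # each call fills `book` in place and hands back the next page URL, or None at the end
        book, url = get_page_book_info(url, book)
    return book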
Example No. 25
    def on_parse(self,resp):
        if resp.code == 200:
            body = resp.body.decode('gbk')
            doc = pyq(body)
            items = doc('#r table')
            lst = []
            for item in items:
                _item = pyq(item)
                _news = {}
                _news['title'] = _item('.text b').text()
                _news['source'] = _item('font nobr').html()
                _news['body'] = _item('font[size="-1"]').text()
                _news['url'] = pyq(_item('.text a')[0]).attr('href')
                lst.append(_news)

            self._callback(lst)
Example No. 26
	def getProblemMax(self):
		self.getVolumeCount()
		
		d = pyq(url = self.baseUrl + self.voluemPath + str(self.voluemCount))
		self.problemMax = int(d('#content_body > form:nth-child(1) > table > tr:last-child > td.problemId > a > font').text())
		self.problemCount = self.problemMax - 1001 + 1
		return self.problemMax
Example No. 27
    def fromLeagueidGetTeamid2(self):
        
        url_ = self.urldomain + "/league/%s/jifen" % self.leagueid

        send_headers = {
            'Host': 'www.dszuqiu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive'
        }
        req = urllib2.Request(url_,headers=send_headers)
        htm_rsp = urllib2.urlopen(req).read()
        mpyq = pyq(htm_rsp)
        trs = mpyq('table.live-list-table tbody tr')
        teamlist = []
        
        for i in range(trs.length):

            tds = trs.eq(i).find('td')
            if tds.length > 0:
                jq_as = tds.eq(2).find('a')
                print jq_as.text()
                ttid = jq_as.eq(0).attr('href').split('/')[-1]
                if ttid in teamlist:
                    continue
                teamlist.append(ttid)
                print ttid + " " + jq_as.eq(0).text()
                self.team_dict.setdefault(ttid, jq_as.eq(0).text())
        return self.team_dict
Example No. 28
	def parsePost(self, response):

		def filterRule(url):
			if '/wp-content/uploads/' in url:
				return url 

		d = pyq(response.body)
		post = {
			"url": response.url,
			"title": d('h1.entry-title').text(),
			"category": response.css('span.cat-links > a::text').extract()[0],
			"datetime": response.css('time.entry-date::text').extract()[0],
			"author": response.css('span.vcard > a::text').extract()[0],
			"content":  d('div.entry-content').text(),
			"img": filter(filterRule, response.css('img::attr(src)').extract()),
		}
		self.postcollection.update({"url": post['url']}, post, True)

		'''
		the scheduler handles `yield` differently here than in tornado or twisted:
		`next()` is called immediately rather than after the IO has completed,
		so simply yielding each Request keeps the downloads running in parallel
		'''
		for url in post['img']:
			yield Request(url, callback=self.saveImage)
Example No. 30
	def parseThread(self, response):
		url = response.url.replace('http://bbs', 'http://www')
		reply = []
		for floor in response.css('div.tpc_content').extract():
			reply.append(pyq(floor).text())

		self.collection.update({"url": response.url}, {'$set': {"reply": reply}}, True)
Example No. 31
def tongji(headers, url, begin, end):
	'''
	Count how many posts each user id made between the given floors of a thread, sorted from most to fewest.
	'''
	links = [ url+"%d" %i for i in range(int(begin),int(end) + 1 ) ]
	mydict = {}

	for url in links:
		response, content = http.request(url, 'GET', headers=headers)

		doc = pyq(content)
		
		for i in range(13,77,7):
			if doc("tr").eq(8).text().decode('utf-8') == "提示:本主题启用了“允许发言针对特定用户”功能,您可以单击“回复主题”创建针对特定用户的回复,或单击每一楼层的“答复”按钮快速创建该楼层发表者才可见的回复。":
				i += 1

			try:
				name = doc("tr").eq(i)
				s = name.text().decode('utf-8')
				# print s,
				if not s in mydict:
					mydict[s] = 1
				else:
					mydict[s] += 1
			except BaseException:
				pass

	delstr = "管理选项 : 修复 | 解锁 | 提升 | 下沉 | 删除 | 移动 | 高亮 | 固顶 | 总固顶 | 区固顶 | 解除保存 |"
	delstr = delstr.decode('utf-8')
	if delstr in mydict:
		del mydict[delstr]

	mydict = sorted(mydict.iteritems(), key=itemgetter(1), reverse=True)

	return mydict
Example No. 32
    def get(self, template_variables={}):
        url = self.get_argument("url", "")
        template_variables["tmall_link"] = url
        template_variables["static_path"] = self.static_path

        if is_weixin_browser(self) or is_mobile_browser(self):
            tmall_pattern = re.compile(
                r'http://detail.tmall.com/item.htm?\S*id=(\d+)')
            tmall_match = tmall_pattern.search(url)
            if tmall_match:
                sku = tmall_match.group(1)

                doc = pyq(
                    "http://djaa.cn/ajax/cm_details/to_cm_details_tmall.php",
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (MicroMessenger;iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'
                    },
                    method='post',
                    data={
                        'id': sku,
                        'shopUrl': url,
                        'shop_type': 'tmall',
                        'small_shop_type': 'cm_details'
                    })
                #print doc
                title = doc('.dtif-h').text()
                content = doc('.viewport').outerHtml()
                template_variables["title"] = title
                template_variables["content"] = content
                template_variables["sku"] = sku
                self.render(self.template_path + "tmall.html",
                            **template_variables)
        else:
            self.redirect(url)
Example No. 33
 def getTeamPageData(self, url):
     print 'url:%s' % url
     send_headers = {
         'Host': 'www.dszuqiu.com',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Connection': 'keep-alive'
     }
     req = urllib2.Request(url, headers=send_headers)
     tmp_rsp = urllib2.urlopen(req).read()
     if tmp_rsp:  # only parse non-empty responses
         mpyq = pyq(tmp_rsp)
         sq_tables = mpyq('section.content.active table.live-list-table'
                          )  # there may be two tables: fixtures not yet started and finished ones
         target_tb = sq_tables.eq(-1)
         # trs = mpyq('section.content.active table.live-list-table tbody tr')   #error
         trs = target_tb.find('tbody tr')
         for i in range(trs.length):
             tds = trs.eq(i).children(
             )  # sample row: league, kickoff time, [rank] home v away [rank], handicap/odds columns
             if tds.eq(10).text().find('-') == -1:
                 tmplist = [tds.eq(j).text() for j in range(tds.length)]
                 self.teamData.append(tmplist)
Example No. 34
def getStat(mid):
    import gzip
    import StringIO
    url_stat = "http://odds.500.com/lq/stat.php?id=" + mid
    request = urllib2.Request(url_stat)
    request.add_header('Accept-encoding', 'gzip')
    opener = urllib2.build_opener()
    mFile = opener.open(request)
    isGzip = mFile.headers.get('Content-Encoding')
    if isGzip == 'gzip':
        compresseddata = mFile.read()
        compressedstream = StringIO.StringIO(compresseddata)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        stat_rsp = gzipper.read()
    else:
        stat_rsp = mFile.read()

    stpyq = pyq(stat_rsp)

    subSceList = []

    # only the last two <tr> rows hold the per-quarter scores
    tr_a = stpyq('tr#bf_away')
    tr_h = stpyq('tr#bf_home')
    tds_a = tr_a.find('td')
    tds_h = tr_h.find('td')

    # drop the first and last <td> cells
    for i in range(tds_a.length - 2):
        tmpSubSce = ' %s-%s' % (tds_a.eq(i + 1).text(), tds_h.eq(i + 1).text())
        subSceList.append(tmpSubSce)
        # print tmpSubSce
    return subSceList
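A hedged invocation sketch; the match id below is a placeholder:

if __name__ == '__main__':
    for quarter_score in getStat('12345'):  # placeholder match id
        print(quarter_score)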
Example No. 35
def login():
  #get_login_url
  jw2005_url = 'http://jw2005.scuteo.com/'
  response = request(url=jw2005_url)
  #http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default2.aspx

  array = response.geturl().split("/")
  array[4] = "default6.aspx"
  login_url = "/".join(array)
  #http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default6.aspx

  doc = pyq(url=login_url)
  viewstate = doc("#Form1 input")[0].value

  id = 123
  password = 123
  values = {
  '__VIEWSTATE' : viewstate,
  # tname:
  # tbtns:
  'tnameXw':"yhdl",
  'tbtnsXw':"yhdl|xwxsdl",
  'txtYhm':id,
  # txtXm:
  'txtMm':password,
  "rblJs":"",
  "btnDl":""#.decode('gbk').encode('gbk'),
  }
  headers = {}
  response, response_cookie = request(login_url, values, headers, True)
  return response.geturl()
Example No. 36
def AnalyHtml(url,filePath):
    if filePath != '':
        pass
    else:

        htl = pyq(url = url)
        htl2 = htl('cc')
        for i in htl2:
            htl3 = pyq(i)
            htl4 = htl3('div')
            if htl4.find('img'):
                # print(htl4)
                print(htl4('img').attr('src'))
            else:
                # print(htl3('div').text())
                pass
Example No. 37
 def __init__(self, forumUrl, userName, password, proxy=None):
     ''' Initialise the forum URL, user name, password, and optional proxy server. '''
     self.forumUrl = forumUrl
     self.userName = userName
     self.password = password
     self.formhash = ''
     self.isLogon = False
     self.isSign = False
     self.xq = ''
     self.postOldData = {}
     self.get_post_form_data = {}
     self.jar = cookielib.CookieJar()
     self.pids = []
     self.get_reply_content = [
         u"顶[s:53] [s:53] ", u"菱湖人顶个贴", u"[s:48] [s:48] [s:48] 顶顶",
         u"老菱湖人来顶顶帖子", u"混个脸熟[s:48] [s:48]",
         u"[s:89] [s:89] [s:89] [s:89] ", u"[s:77] [s:77] [s:77] 菱湖人路过",
         u"[s:53][s:53]顶[s:53]", u"顶顶顶[s:53][s:53]",
         u"[s:53]路过[s:53][s:53]", u"走走看看[s:53][s:53]",
         u"老菱湖人看看[s:53][s:53]", u"有没有菱湖的[s:53]"
     ]
     if not proxy:
         openner = urllib2.build_opener(
             urllib2.HTTPCookieProcessor(self.jar))
     else:
         openner = urllib2.build_opener(
             urllib2.HTTPCookieProcessor(self.jar),
             urllib2.ProxyHandler({'http': proxy}))
     urllib2.install_opener(openner)
     req = urllib2.Request(forumUrl + "/login.php?")
     content = urllib2.urlopen(req).read()
     doc = pyq(content)
     for item in doc("form").children("input").items():
         self.postOldData[item.attr("name")] = item.val()
Example No. 38
def login():
    #get_login_url
    jw2005_url = 'http://jw2005.scuteo.com/'
    response = request(url=jw2005_url)
    #http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default2.aspx

    array = response.geturl().split("/")
    array[4] = "default6.aspx"
    login_url = "/".join(array)
    #http://222.201.132.117/(nouuvu55yi1bpk45tz3rhkjy)/default6.aspx

    doc = pyq(url=login_url)
    viewstate = doc("#Form1 input")[0].value

    id = 123
    password = 123
    values = {
        '__VIEWSTATE': viewstate,
        # tname:
        # tbtns:
        'tnameXw': "yhdl",
        'tbtnsXw': "yhdl|xwxsdl",
        'txtYhm': id,
        # txtXm:
        'txtMm': password,
        "rblJs": "",
        "btnDl": ""  #.decode('gbk').encode('gbk'),
    }
    headers = {}
    response, response_cookie = request(login_url, values, headers, True)
    return response.geturl()
Example No. 39
def getPlayerInfo(playerid):
    html = getHtml('www.csgola.com', '/player/' + playerid)
    q = pyq(html)
    avatar = q('img.avatar.center-block.img-responsive').attr('src')
    playername = q('.personaname').text()
    statTit = q('.col-md-10 .title').text().encode('utf-8')
    statVal = q('.col-md-10 .datala').text().encode('utf-8').split()   # one entry per stat value
    chartVal = q('.polar-detail .datala').text().encode('utf-8').split()
    json = {
        'error': 0,
        'playerinfo': {
            'avatar': avatar,
            'name': playername,
        },
        'stats': {
            'jishashu':       statVal[0],
            'baotoulv':       statVal[1],
            'kd':             statVal[2],
            'shenglv':        statVal[3],
            'zhengwangshu':   statVal[4],
            'mingzhonglv':    statVal[5],
            'juanzengwuqi':   statVal[6],
            'mvpcishu':       statVal[7],
        },
        'chart': {
            'zonghe':         chartVal[0],
            'kd':             chartVal[1],
            'mingzhonglv':    chartVal[2],
            'baotoulv':       chartVal[3],
            'shenglv':        chartVal[4],
        },
    }
    return json
Example No. 40
def movie_links_range(year, index):
	base_url="https://movie.douban.com/tag";
	resource_url = "%s/%d?start=%d&type=T" %(base_url, year, index);
	rtree = pyq(url=resource_url);
	print resource_url;
	items = rtree('.nbg');
	rst_list = [];
	##for idx in range(1):
	for idx in range(len(items)):
		link = items.eq(idx).attr('href');
		title = items.eq(idx).attr('title').encode("UTF-8");
		rst_list.append((link, title))
		pass ; 

##	actors_list = [];
##	items = rtree('.item');
##	##print items;
##	for idx in range(len(items)):
##		actors = items.eq(idx)('td')[1].find_class('pl')[0].text_content().encode("utf-8");
##		si = actors.rfind(") /");
##		if -1 != si:
##			actors = actors[si+3:];
##		actors_list.append(actors);

	movie_detail_list = [];
	for item in rst_list:
		movie_detail = get_movie_detail_by_link(item[0]);
		if movie_detail is not None:
			movie_detail["link_info"] = item;
			movie_detail_list.append(movie_detail);
		pass ;

	return movie_detail_list; 
Example No. 41
    def kvartira(self, response):
        pyquery = pyq(url=response.url)
        dt = Handler.today(resp_date=response.doc('div.item_title').text().split(']')[0].replace('[', ''))
        number = re.compile(r'\w+.+?(\d+)')
        text_nomer = response.doc('.content span.date_update').text().replace('\t','').replace('\n', '')
        number = number.findall(text_nomer)
        content = {name.text_content().encode('latin1').decode('utf8').replace("\t", '').replace("\n", ''):
                   pyquery("tr td.bold + td").eq(i).text().encode('latin1').decode('utf8')
                   for i, name in enumerate(pyquery.find("td.bold"))}
        contact = {name.text_content().encode('latin1').decode('utf8').replace("\t", '').replace("\n", ''):
                   pyquery("tr td.caption + td").eq(i).text().encode('latin1').decode('utf8')
                   for i, name in enumerate(pyquery.find("td.caption"))}
        content.update({
            "url": response.url,
            "Крошки": response.doc('div.content  div.item_title  span.bold').text(),
            "Дата парсинга": str(date.today()),
            "Обьявление":pyquery.find("div.content table tbody tr td[colspan]").text().encode('latin1').decode('utf8'),
            "Номер обьявления": number[0],
            "Дата публикации": dt,
            "Фото": Handler.url_photo(response.doc('div.thumb-carousel div.thumb a.image_thumb').items()),
            "Путь скриншота": selen.screen(response.url),

        })
        content.update(contact)
        return content
Example No. 42
def get_url():
    __site = 'https://www.ptt.cc'
    __req_url = __site + '/bbs/Tech_Job/index.html'
    __idx = 1
    all_url = []
    while True:
        #print "this page1 = %s" % __req_url
        try:
            __response = urllib2.urlopen(__req_url, timeout=9999)
            __the_page = __response.read()
            doc = pyq(__the_page)

        except:
            continue

        doc.make_links_absolute(base_url=__site)

        for __i in doc('div.title a'):
            #print doc(__i).text()
            #print 'https://www.ptt.cc' + doc(__i).attr('href')
            all_url.append(doc(__i).attr('href'))

        __req_url = doc('.btn.wide').eq(1).attr('href')

        __idx += 1

        if __idx > 2:
            break

        if __req_url is None:
            break
    return all_url
Example No. 43
 def parse(self, response):
     l = ItemLoader(item=Problem(), response=response)
     d = pyq(response.body)
     l.add_value('id', response.url[-4:])
     l.add_value('title', d('#content_body > center:nth-child(1) > span').text())
     l.add_value('body', d('#content_body').text())
     return l.load_item()
Example No. 44
	def parsePost(self, response):

		def filterRule(url):
			if '/attachment/' in url:
				return url 

		d = pyq(response.body)
		post = {
			"url": response.url,
			"title": response.css('#passage-title::text').extract()[0],
			"category": response.css('div.list-title-word::text').extract()[0],
			"datetime": response.css('#passage-info::text').extract()[0].split(' | ')[0],
			"hit": response.css('#passage-info::text').extract()[0].split(' | ')[1],
			"detail":  d('#passage-detail').text(),
			"img": filter(filterRule, response.css('img::attr(src)').extract()),
		}

		self.collection.update({"url": post['url']}, post, True)

		'''
		the scheduler handles `yield` differently here than in tornado or twisted:
		`next()` is called immediately rather than after the IO has completed,
		so simply yielding each Request keeps the downloads running in parallel
		'''
		for url in post['img']:
			yield Request(url, callback=self.saveImage)
Example No. 45
def translate():
    url = r'http://dict.youdao.com/search?q=' + sys.argv[1]
    doc = pyq(url)
    doc('#custheme').remove()
    doc('.c-topbar').remove()
    doc('.c-subtopbar').remove()
    doc('.c-header').remove()
    doc('#c_footer').remove()
    doc('.c-bsearch').remove()
    doc('#ads').remove()
    doc('#rel-search').remove()
    doc('.error-wrapper').remove()
    doc('#topImgAd').remove()
    doc('#container').css('margin', '0')
    doc('#container').css('width', '500px')
    doc('#results').css("margin-left", "20px")
    doc('#results-contents').css('width', '480px')
    doc('#results-contents').css('margin', '0')
    doc('#result_navigator').css('left', '380px')
    # result_navigator must not be removed, otherwise the definition tabs can no longer be switched
    doc('#result_navigator').css('display', 'none')
    for a in doc('a'):
        href = a.get('href')
        if href is not None and href.startswith('/'):
            a.make_links_absolute('http://dict.youdao.com')

    link = u"<a href='" + url + u"'>在浏览器中查看翻译</a>"
    doc('#results-contents').append(link)
    print doc.outerHtml()
Example No. 46
    async def postemail(self, pe_url, mailform, getconfig, getauth):
        _LOGGER.info("Sending email form...")
        postemail = await self.execRequest({
            "url": pe_url,
            "params": mailform,
            "headers": {
                "accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.82",
                "origin": getconfig["issuer"],  # "issuer" from config
                "accept-language": "de-de",
                "user-agent":
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
                "referer": getauth.url
            },
            "method": "POST"
        })

        _LOGGER.info("Done!")
        _LOGGER.info("Parsing email form response...")
        pqpe = pyq(postemail.text)
        pwform = dict([
            (t.attrib["name"], t.attrib["value"])
            for t in pqpe("#credentialsForm").find("[type='hidden']")
        ])
        pwform["password"] = self.config["password"]

        ppwurl = getconfig["issuer"] + pqpe(
            "#credentialsForm")[0].attrib["action"]

        _LOGGER.info("Done!")
        await self.postpw(ppwurl, pwform, getconfig, postemail)
Example No. 47
 def __init__(self,dest):
     self.url = dest
     doc = pyq(url = self.url)
     records = doc('.list-img')
     for record in records:
         houseInfo = HouseInfo()
         titleLine = record.cssselect('.list-info-title')[0];
         houseInfo.title = self.getAttr(titleLine,'title')
         houseInfo.link = '%s%s' % (r'http://bj.ganji.com',self.getAttr(titleLine,'href'))
         district = record.cssselect('.list-word-col')[0]
         houseInfo.district = district.text_content()
         subway = record.cssselect('.list-word-col')[1]
         houseInfo.subway = subway.text_content()
         dist = record.cssselect('.list-word')[0]
         houseInfo.distance = dist.text_content().split('-')[-1].decode('utf-8')
         #(houseInfo.kind,houseInfo.area,houseInfo.decoration,houseInfo.floor,houseInfo.direction) = record.cssselect('.list-word')[1].text_content().split('/')
         props = record.cssselect('.list-word')[1].text_content().split('/')
         houseInfo.kind = props[0]
         houseInfo.area = props[1]
         houseInfo.decoration = props[2]
         houseInfo.floor = props[3]
         houseInfo.direction = props[4]            
         self.houses.append(houseInfo)
         #print(record.cssselect('.list-word')[1].text_content())
     for house in self.houses:
         print(house.title)
Example No. 48
	def parsePost(self, response):
		def analysys(response):
			try:
				d = pyq(response.css('div#artibody').extract()[0])
				data = {
					"url": response.url,
					"title": response.css('h1#artibodyTitle::text').extract()[0],
					"body": d.text(),
					"date": response.css('span#pub_date::text').extract()[0],
					"parsed": "1"
				}
				return data

			except IndexError, e:
				pass

			try:
				d = pyq(response.css('td[valign="top"]').extract()[2])
				data = {
					"url": response.url,
					"title": response.css('font[size="5"]::text').extract()[0],
					"body": d.text(),
					"date": response.css('font[face="Arial"]').extract()[0],
					"parsed": "2"
				}
				return data

			except IndexError, e:
				pass
Example No. 49
    def getNbySingle(self, teamid, teamname):
        url = 'http://liansai.500.com/team/' + str(teamid)
        print url
        headers = {
            'Host': 'liansai.500.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        rdata = None
        req = urllib2.Request(url, rdata, headers)
        rsp = urllib2.urlopen(req)
        if rsp.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(rsp.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
        else:
            data = rsp.read()

        mypqy = pyq(data)
        mtbody = mypqy('table.lcur_race_list tbody')
        trs = mtbody.find('tr')
        singleRd_list = []

        for i in range(5):
            tds = trs.eq(i).find('td')
            singleRd_list.append(
                [teamname.encode('gbk'),
                 tds.eq(5).text().encode('gbk')])
            print '%s %s' % (teamname, tds.eq(5).text())
        return singleRd_list
Example No. 51
    def parse(self, url):
        res = requests.get(url)
        assert res.status_code == 200
        jq = pyq(res.content)
        self.url = url
        self.price = jq('.PriceContainer').text()
        self.color = jq('.colorLabel').text()
        self.name = jq('.productInfo>h1').text()
        category_id = re.findall('/(\d+)-', url)
        self.category_id = category_id[0] if category_id else ''
        images = jq('.productSlideshow>ul>li>div>img')
        image_list = []
        for r in images:
            image_url = r.get('src')
            if not image_url:
                continue
            image_list.append('%s%s' % (CosstoresGoodsPrase.COSSTORES_HOST, image_url))
        self.image = image_list
        first_image = image_list[0] if image_list else ''
        goods_id = re.findall('/(\d+)/', first_image)
        self.goods_id = str(goods_id[0]) if goods_id else ''

        # ajax动态请求
        goods_detail_ids = jq('.productSizes>label>input')
        goods_detail_id = goods_detail_ids[0].get('value') if goods_detail_ids else ''
        if goods_detail_id:
            goods_detail_url = 'http://www.cosstores.com/gb/product/GetVariantData?variantId=%s&lookID=null&image=0' % (goods_detail_id)
            res = requests.get(goods_detail_url)
            assert res.status_code == 200
            result = res.json()
            self.code = result.get('HMOrderNo', '')
            self.original_price = result.get('DefaultPriceWithCurrency', '')
            self.price = result.get('PriceWithCurrency', '')
            self.attributes = result.get('Attributes', [])
            self.details = result.get('DescriptionShort', '')
Example No. 52
 def getMatchid_cl(self, lgid):
     mypyq = pyq(self.rsp)
     trs_euro = mypyq('table.lcur_race_list tbody tr')
     for i in range(trs_euro.length):
         tds = trs_euro.eq(i).find('td')
         matid = tds.eq(6).find('a').attr('href').split('-')[-1].split(
             '.')[0]
         self.mmatchid_list.append(matid)
Example No. 53
def request_weixin(query, qtype=1):
  url = 'http://weixin.sogou.com/weixin?type=%d&query=%s'
  doc = pyq(url=(url % (qtype, query)))
  weixin_list = doc(".results>div").items()
  for item in weixin_list:
    openid = item.attr['href'][12:]
    name = item(".txt-box>h3").text()
    weixin_num = item(".txt-box>h4>span").text()[4:]
    print(name + ": " + weixin_num + " " + openid)
Example No. 54
 def get_proxy_list(self):
     r = requests.get(gPROXY, timeout=5)
     doc = pyq(r.content)
     tr_list = [i.text() for i in doc('#proxylisttable').find('tr').items()]
     tr_list = [i for i in tr_list[1:] if i]
     proxy_list = [i.split('/n')[:2] for i in tr_list]
     proxy_list = [':'.join(i) for i in proxy_list]
     proxy_list = [{'http': i, 'https': i} for i in proxy_list]
     return proxy_list
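A hedged usage sketch: try each proxy mapping returned above until one answers; the probe URL is a placeholder:

import requests

def fetch_via_proxy(proxy_list, url='http://httpbin.org/ip'):
    for proxy in proxy_list:
        try:
            return requests.get(url, proxies=proxy, timeout=5).text
        except requests.RequestException:
            continue
    return None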
Example No. 55
 def dir_parse(self, page, spider_list, result_list):
     #print page
     doc = pyq(page)
     tmp = doc('div[class=article]')
     tl = tmp('tr[class=item]')
     #print tl
     for tr in tl:
         dl = pyq(tr)('div[class=pl2]')
         #print dl
         a = dl('a')
         print a.attr('href')
         result_list.insert(0, a.attr('href'))
     next = doc('span[class=next]')
     a = next('a').attr('href')
     if a is not None and len(a) > 5:
         print a.encode("UTF-8")
         spider_list.append(a.encode("UTF-8"))
     return
Example No. 56
 def format_html(self, html):
     if html == None or html == '':
         return ''
     try:
         doc = pyq(html)
         text = doc.text()
         result = ' '.join(text.replace('\n', ' ').split())
     except Exception, e:
         result = html
     return result
Example No. 57
 def get_citys(self, fid=0):
     url = self._url.format(fid)
     doc = pyq(url=url)
     text = doc.text()[21:-1]
     try:
         return json.loads(text)
     except:
         print text
         return []
Example No. 58
    def getPreviousSingle(self, matchid):
        url = 'http://odds.500.com/fenxi/shuju-' + str(matchid) + '.shtml'
        #     url = 'http://live.500.com/'
        print url
        #         rsp = urllib.urlopen(url).read()
        headers = {
            'Host': 'odds.500.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        rdata = None
        req = urllib2.Request(url, rdata, headers)
        rsp = urllib2.urlopen(req)
        if rsp.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(rsp.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
        else:
            data = rsp.read()
#         print "rsp:%s" % data
        mypyq = pyq(data)

        trs = mypyq('body div#team_jiaozhan tr[fid]:gt(0)')
        singleRd_list = []
        print trs

        # limit to the first three head-to-head rows
        if trs.length > 3:
            mlen = 3
        else:
            mlen = trs.length

        for i in range(mlen):
            tds = trs.eq(i).find('td')
            singleRd_list.append([
                ''.join(tds.eq(1).text().split('-')),
                re.sub('\[\d+\]\s', '',
                       tds.eq(2).find('span.dz-l').text()).encode('gbk'),
                ' ' + ''.join(tds.eq(2).find('em').text().split(' ')),
                re.sub('\s\[\d+\]', '',
                       tds.eq(2).find('span.dz-r').text()).encode('gbk')
            ])

            print '%s %s %s %s' % (''.join(tds.eq(1).text().split('-')),
                                   re.sub('\[\d+\]\s', '',
                                          tds.eq(2).find('span.dz-l').text()),
                                   ''.join(
                                       tds.eq(2).find('em').text().split(' ')),
                                   re.sub('\s\[\d+\]', '',
                                          tds.eq(2).find('span.dz-r').text()))
        return singleRd_list
Example No. 59
def getname(headers):
    '''
    Find user ids from the post-count ranking, then fetch each user's birthday and QQ number and save them to a text file.
    '''
    url = "http://www.cc98.org/toplist.asp?orders=1&page="

    file = open('id.txt', 'w+')
    links = [url + "%d" % i for i in xrange(100, 150)]

    for link in links:
        response, content = http.request(link, 'GET', headers=headers)
        soup = BeautifulSoup(content)
        get = soup.findAll("td", {"class": "tablebody1"})

        for i in xrange(0, 100, 5):
            userid = get[i]
            endurl = userid.find("a")['href']
            idurl = "http://www.cc98.org/" + endurl
            idresponse, idcontent = http.request(idurl, 'GET', headers=headers)
            idcontent = BeautifulSoup(idcontent)
            name = get[i].find("a").string
            name = pyq(name).text()

            # write the user id
            file.write(name)

            bir = idcontent.findAll("td", {"class": "tablebody1"},
                                    {"style": "line-height:150%"})[-1]
            for br in bir.findAll('br'):
                next = br.nextSibling
                beg = next.find("生 日: ")
                if beg == 0:
                    pyear = next.find(" 年")
                    pmon = next.find(" 月")
                    pday = next[pmon:].find(" 日")
                    year = next[beg + 5:pyear]
                    mon = next[pyear + 2:pmon]
                    day = next[pmon + 2:pday + pmon]
                    if year != '':
                        # print "year = " + year
                        # print "mon = " + mon
                        # print "day = " + day

                        # write the birthday
                        file.write(" : " + year + mon + day)

                beg = next.find("QQ : ")
                if beg == 0:
                    qq = next[beg + 5:]
                    # print qq
                    # write the QQ number
                    file.write(" : " + qq + '\n')

    file.close()