# Assumed imports for these snippets: "retest" is evidently the standard
# library re module imported under an alias; urllib and time are used throughout.
import re as retest
import time
import urllib.request
import urllib.error


def craw(url, page):
    html1 = urllib.request.urlopen(url).read()
    html1 = str(html1)
    print(html1)
    pat1 = '<div id="plist".+? <div class="page clearfix">'
    result1 = retest.compile(pat1).findall(html1)
    print("=============")
    print(result1)
    result1 = result1[0]
    print("=============")
    print(result1)
    pat2 = '<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    imagelist = retest.compile(pat2).findall(result1)
    print("=============????????")
    print(imagelist)
    x = 1
    for imageurl in imagelist:
        imagename = "E:/learn/GitHub/pythoncrawl/img1/" + str(page) + str(
            x) + '.jpg'
        imageurl = "http://" + imageurl
        try:
            # Download the image to the path built above
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # On failure, skip this picture's index and move on
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1
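The function above is typically driven by a small loop that walks a paginated product listing; a minimal sketch, assuming the kind of JD list URL the regex appears to target (the URL and page range are placeholders, not part of the original):

# Hypothetical driver for craw(); URL and page range are assumptions.
for i in range(1, 3):
    listurl = "http://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    craw(listurl, i)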
Example #2
    def parse(self, response):
        item = QtpjtItem()
        # Extract the full picture URLs from the page source
        paturl = "(http://pic.qiantucdn.com/58pic/.*?).jpg"
        item["picurl"] = retest.compile(paturl).findall(str(response.body))
        # Extract the picture id (the last path component before .jpg)
        patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
        item["picid"] = retest.compile(patlocal).findall(str(response.body))
        yield item
        # Queue list pages id-1 through id-200 for crawling
        for i in range(1, 201):
            nexturl = "http://www.58pic.com/tb/id-" + str(i) + ".html"
            yield Request(nexturl, callback=self.parse)
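The spider above assumes a Scrapy item class and the Request import that the listing does not show. A minimal sketch of what they could look like, with the picurl and picid fields inferred from the parse() method (the module layout is an assumption):

# items.py (sketch) -- field names taken from the parse() method above
import scrapy


class QtpjtItem(scrapy.Item):
    picurl = scrapy.Field()  # full picture URLs matched on the page
    picid = scrapy.Field()   # picture ids taken from the URL path


# In the spider module, Request comes from Scrapy:
# from scrapy.http import Request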
Example #3
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath(
            "//span[@class='ArticleTitleText']/a/text()").extract()
        item["url"] = response.xpath(
            "//span[@class='ArticleTitleText']/a/@href").extract()

        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        hcurl = retest.compile(pat1).findall(str(response.body))[0]
        headers2 = (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
        )
        opener = urllib.request.build_opener()
        opener.addheaders = [headers2]
        # Install the opener globally
        urllib.request.install_opener(opener)
        # data holds the hit counts and comment counts for every post on this list page
        data = urllib.request.urlopen(hcurl).read()
        # pat2 extracts each article's read (hit) count
        pat2 = "click\d*?','(\d*?)'"
        # pat3 extracts each article's comment count
        pat3 = "comment\d*?','(\d*?)'"
        # Extract hits and comments and assign them to item["hits"] and item["comment"]
        item["hits"] = retest.compile(pat2).findall(str(data))
        item["comment"] = retest.compile(pat3).findall(str(data))
        yield item
        # Extract the total number of blog-list pages
        pat4 = "blog.hexun.com/p(.*?)/"
        # The regex yields a list; its second-to-last element is the total page count
        data2 = retest.compile(pat4).findall(str(response.body))
        if (len(data2) >= 2):
            totalurl = data2[-2]
        else:
            totalurl = 1
        # The print below can stay commented out in normal runs; enable it when debugging
        # print("Total pages: " + str(totalurl))
        # Crawl each remaining blog-list page in turn
        for i in range(2, int(totalurl) + 1):
            # Build the URL of the next list page to crawl
            nexturl = "http://" + str(
                self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            # Issue the next request, again spoofing a browser User-Agent
            yield Request(
                nexturl,
                callback=self.parse,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
                })
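For context, the method above reads self.uid to build the next list-page URLs, so it presumably lives in a Scrapy spider roughly like the sketch below; the name, start URL, and uid value are placeholders, not the original project, and HexunpjtItem would be defined analogously to the item sketch after Example #2 (fields name, url, hits, comment).

# Hypothetical spider skeleton around the parse() method shown above.
import scrapy
from scrapy.http import Request


class HexunpjtSpider(scrapy.Spider):
    name = "hexunpjt"   # assumed spider name
    uid = "12345678"    # placeholder blog account id
    start_urls = ["http://" + uid + ".blog.hexun.com/p1/default.html"]

    # def parse(self, response):  -> as shown above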
Example #4
    def run(self):
        page = self.pagestart
        # URL-encode the search keyword and the "&page" query fragment
        keycode = urllib.request.quote("key")
        pagecode = urllib.request.quote("&page")
        for page in range(self.pagestart, self.pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(
                page)
            data1 = use_proxy(self.proxy, url)
            # Pull the article URLs out of each result page
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(retest.compile(listurlpat, retest.S).findall(data1))
        print("got " + str(len(listurl)) + " result pages")
        for i in range(0, len(listurl)):
            time.sleep(7)
            for j in range(0, len(listurl[i])):
                try:
                    url = listurl[i][j]
                    url = url.replace("amp;", "")
                    print("enqueued page " + str(i) + ", item " + str(j))
                    self.urlqueue.put(url)
                    self.urlqueue.task_done()
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
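Examples #4, #5, and #10 all call a use_proxy(proxy, url) helper that the listing does not include. A minimal sketch of such a helper, assuming proxy is an "ip:port" string and the standard urllib proxy-handler machinery; this is an assumption, not the original implementation:

# Assumed implementation sketch of use_proxy(); not from the original listing.
import urllib.request


def use_proxy(proxy_addr, url):
    # Route the request through the given HTTP proxy, spoof a browser
    # User-Agent, fetch the page, and return it as decoded text.
    proxy = urllib.request.ProxyHandler({"http": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)")]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", errors="ignore")
    return data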
Example #5
def getcontent(listurl, proxy):
    i = 0
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
      <html xmlns="http://www.w3.org/1999/xhtml">
      <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
      <title>WeChat article page</title>
      </head>
      <body>'''
    # Write the HTML header, then reopen the file in append mode
    fh = open("../1.html", "wb")
    fh.write(html1.encode("utf-8"))
    fh.close()
    fh = open("../1.html", "ab")
    for i in range(0, len(listurl)):
        for j in range(0, len(listurl[i])):
            try:
                url = listurl[i][j]
                url = url.replace("amp;", "")
                data = use_proxy(proxy, url)
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = retest.compile(titlepat).findall(data)
                content = retest.compile(contentpat, retest.S).findall(data)
                thistitle = "not retrieved this time"
                thiscontent = "not retrieved this time"
                if (title != []):
                    thistitle = title[0]
                if (content != []):
                    thiscontent = content[0]
                dataall = "<p>标题为:" + thistitle + "</p><p>内容为:" + thiscontent + "</p><br>"
                fh.write(dataall.encode("utf-8"))
                print("第" + str(i) + "个网页第" + str(j) + "次处理")  #便于调试
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
            except Exception as e:
                print("exception:" + str(e))
                time.sleep(1)
    fh.close()
    html2 = '''</body>
        </html>
        '''
    fh = open("../1.html", "ab")
    fh.write(html2.encode("utf-8"))
    fh.close()
Example #6
    def run(self):
        html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
            <html xmlns="http://www.w3.org/1999/xhtml">
            <head>
            <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
            <title>WeChat article page</title>
            </head>
            <body>'''
        # Write the HTML header, then reopen the file in append mode
        fh = open("../2.html", "wb")
        fh.write(html1.encode("utf-8"))
        fh.close()
        fh = open("../2.html", "ab")
        i = 1
        while (True):
            try:
                url = self.urlqueue.get()
                data = use_proxy(self.proxy, url)
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sq_bar"'
                title = retest.compile(titlepat).findall(data)
                content = retest.compile(contentpat, retest.S).findall(data)
                thistitle = "not this time"
                thiscontent = "not this time"
                if (title != []):
                    thistitle = title[0]
                if (content != []):
                    thiscontent = content[0]
                dataall = "<p>title:" + thistitle + "</p><p>content:" + thiscontent + "</p><br>"
                fh.write(dataall.encode("utf-8"))
                print("page " + str(i) + "...")
                i += 1
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception:" + str(e))
                time.sleep(1)
        # Note: the lines below run only if the loop above is ever broken out of
        fh.close()
        html2 = '''</body>
        </html>
        '''
        fh = open("../2.html", "ab")
        fh.write(html2.encode("utf-8"))
        fh.close()
def getlink(url):
    # Spoof a browser User-Agent, fetch the page, and extract every link on it
    headers = ("user-agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # The pattern has two capture groups, so findall returns (url, tail) tuples
    pat = '(https?://[^\s)";]+\.(\w|/)*)'
    link = retest.compile(pat).findall(data)
    link = list(set(link))  # deduplicate
    return link
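Because of the two capture groups noted above, a caller that only wants the URLs has to take the first element of each tuple; a small assumed usage sketch:

# Hypothetical usage of getlink(); the target URL is just a placeholder.
links = getlink("http://www.example.com/")
urls = [full_url for full_url, _tail in links]
print(str(len(urls)) + " unique links found")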
def getcontent(url,page):
    headers = ("user-agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    urllib.request.install_opener(opener)
    data=urllib.request.urlopen(url).read().decode("utf-8",  errors='ignore')  # decoding raises errors here, so ignore them for now
    data=str(data)
    # print(data)
    # userpat='target="_blank" title="(.*?)">'
    # Sample of the target markup: < a class ="recmd-content" href="/article/121217166" target="_blank" onclick="_hmt.push(['_trackEvent','web-list-user','chick'])" > (post text: "Brother Yong is gone, and Jin Yong is gone too; Moments is hit by wave after wave of posts -- insurance agents pitch critical-illness cover, water-purifier sellers stress the importance of drinking water, supplement sellers tout their products... I say, Brother Yong could not afford insurance or a water purifier either") < / a >
    userpat="'web-list-user','chick'])\" >(.*?)</a>\""
    # contentpat='<div class="content">(.*?)</div>'
    # userlist=re.compile(userpat,re.S).findall(data)
    userlist=retest.compile(userpat, retest.S).findall(data)
    print(userlist)
Example #9
def parse_one_page(html):
    pattern = retest.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?<a.*?src="(.*?)">.*?name"><a' +
        '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>' +
        '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', retest.S)
    items = retest.findall(pattern, html)
    print(items)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip()
        }
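A sketch of how parse_one_page() might be driven with urllib, in the same style as the other examples; the board URL and User-Agent are placeholders for whatever page the regex was written against:

# Hypothetical driver for parse_one_page(); URL and headers are assumptions.
req = urllib.request.Request(
    "https://maoyan.com/board/4",
    headers={"User-Agent": "Mozilla/5.0"})
html = urllib.request.urlopen(req).read().decode("utf-8", errors="ignore")
for movie in parse_one_page(html):
    print(movie)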
Example #10
def getlisturl(key, pagestart, pageend, proxy):
    try:
        page = pagestart
        listurl = []  # collected article URLs, one sublist per result page
        keycode = urllib.request.quote(key)
        pagecode = urllib.request.quote("&page")
        for page in range(pagestart, pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(
                page)
            data1 = use_proxy(proxy, url)
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(retest.compile(listurlpat, retest.S).findall(data1))
        print("total" + str(len(listurl)))
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(0)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
Example #11
print(result2)

print("----------")
string = "apythonhellomypythonhispythonourpythonend"
pattern = ".python."
result = retest.match(pattern, string)
result2 = retest.match(pattern, string).span()
print(result)
print(result2)

print("----------")
string = "hellomypythonhispythonourpythonend"
pattern = ".python."
result = retest.match(pattern, string)
result2 = retest.search(pattern, string)
print(result)
print(result2)

print("----------")
string = "hellomypythonhispythonourpythonend"
pattern = retest.compile(".python.")  # precompile the pattern
result = pattern.findall(string)  # find every match of the pattern
print(result)
result = retest.compile(".python.").findall(string)
print(result)

print("----------")
pattern = "python."
resultl = retest.sub(pattern, "php", string)  #全部替换
result2 = retest.sub(pattern, "php", string, 2)  #最多替换两次
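For reference, a quick check of what the findall call above returns on this string, run under the same import re as retest assumption:

import re as retest

string = "hellomypythonhispythonourpythonend"
# Non-overlapping matches of ".python." -- one character on each side of "python"
print(retest.compile(".python.").findall(string))
# -> ['ypythonh', 'spythono', 'rpythone']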
Example #12
    headall.append(item)
opener.addheaders = headall
urllib.request.install_opener(opener)


# Custom function craw(vid, comid): fetch the comment page for the given video id
# and comment id, and return the raw response data
def craw(vid, comid):
    url = "http://coral.qq.com/article/" + vid + "/comment?commentid=" + comid + "&reqnum=20"
    data = urllib.request.urlopen(url).read().decode("utf-8")
    return data


idpat = '"id":"(.*?)"'
userpat = '"nick":"(.*?)",'
conpat = '"content":"(.*?)",'
# Outer loop: fetch one page of comments per iteration
for i in range(1, 10):
    print("------------------------------------")
    print("Comments, page " + str(i))
    data = craw(vid, comid)
    idlist = retest.compile(idpat, retest.S).findall(data)
    userlist = retest.compile(userpat, retest.S).findall(data)
    conlist = retest.compile(conpat, retest.S).findall(data)
    # Inner loop: print each of the 20 comments on this page
    for j in range(0, 20):
        # The API returns \uXXXX escapes, so eval them back into readable text
        print("User: " + eval('u"' + userlist[j] + '"'))
        print("Comment: " + eval('u"' + conlist[j] + '"'))
        print("\n")
    # Continue from the last comment id on this page so the next request loads the next batch
    comid = idlist[19]