Example #1
 def scrapyDetail(self, detailurl, proxies, id):
     headers = {
         'Connection': 'close',
         "user-agent": userAgents[random.randrange(0, len(userAgents))]
     }
     req = s.get(detailurl, headers=headers, proxies=proxies, timeout=2)
     htmlStr = req.text
     ele = etree.HTML(htmlStr)
     content_ele = ele.xpath("//div[@class='post-content']/div")[0].xpath(
         "p")
     content_str = "<div>"
     for ele_item in content_ele:
         content_str = content_str + str(etree.tostring(
             ele_item, encoding="utf-8", method="HTML", pretty_print=True),
                                         encoding="utf8")
     content_str + '</div>'
     try:
         Photography = session.query(PhotographyM).filter(
             PhotographyM.id == id).all()
         if len(Photography) != 0:
             Photography[0].content = content_str
             session.commit()
     except InvalidRequestError as err:
         print("Update error: %r" % err)
         session.rollback()
     except Exception as err:
         print("Mysql2 unknown error: %r" % err)
         session.rollback()
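These methods lean on module-level names the snippets never define (s, userAgents, session, etree, InvalidRequestError, and the models). A minimal sketch of that assumed scaffolding, so the examples read in context (the connection string and user-agent strings are placeholders):

import os
import random
from urllib import parse

import requests
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.exc import InvalidRequestError
from sqlalchemy.orm import sessionmaker

# Shared HTTP session reused across requests.
s = requests.Session()

# Pool of desktop user-agent strings rotated per request (placeholders).
userAgents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
]

# Hypothetical database URL; PhotographyM, FilmsM and ProxyIpM are the
# project's models, defined elsewhere.
engine = create_engine("mysql+pymysql://user:password@localhost/scrapydb")
session = sessionmaker(bind=engine)()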
Example #2
 def savePic(self, imgcover, proxies, id):
     basedir = os.path.dirname(__file__)
     # Derive the local folder and filename from the image URL's query string.
     hostpath = parse.urlparse(imgcover)
     imgpathlist = hostpath[4].split("/")
     imgname = imgpathlist[-1].split("&")[0]
     imgpath = os.path.abspath(
         os.path.join(basedir, '..', '..', 'static', 'photography',
                      "/".join(imgpathlist[3:5])))
     headers = {
         'Connection': 'close',
         "user-agent": userAgents[random.randrange(0, len(userAgents))]
     }
     if not os.path.exists(imgpath):
         print("路径不存在,正在创建路径~~~~~~")
         os.makedirs(imgpath)
     try:
         res = s.get(imgcover, headers=headers, proxies=proxies, timeout=2)
         with open(os.path.join(imgpath, imgname), "wb") as fp:
             fp.write(res.content)
         # Store the local URL of the saved cover on the matching row.
         try:
             query = session.query(PhotographyM).filter(
                 PhotographyM.id == id).all()[0]
             imgurl = '/static/photography/{0}/{1}'.format(
                 "/".join(imgpathlist[3:5]), imgname)
             query.imgurlstr = imgurl
             session.commit()
         except InvalidRequestError as err:
             print("InvalidRequestError: %r" % err)
             session.rollback()
         except Exception as err:
             print("Exception: %r" % err)
             session.rollback()
     except Exception as err:
         print("图片下载Error:{0}".format(err))
Example #3
def startProxyIP(url, n):
    # url is a template with a {0} page placeholder; n is the page number.
    page_url = url.format(n)
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    }
    try:
        req = requests.get(page_url, headers=headers)
        ele = etree.HTML(req.text)
        tr_dom_list = ele.xpath("//table[@id='ip_list']//tr")
        # Skip the header row, then read one proxy record per table row.
        for tr_item in tr_dom_list[1:]:
            td_dom_list = tr_item.xpath("./td[not(@class)]")
            if len(td_dom_list) != 0:
                ipStr = td_dom_list[0].xpath('./text()')[0]
                port = td_dom_list[1].xpath("./text()")[0]
                address = td_dom_list[2].xpath("./a/text()")[0]
                type1 = td_dom_list[3].xpath("./text()")[0]
                time1 = td_dom_list[4].xpath("./text()")[0]
                time2 = td_dom_list[5].xpath("./text()")[0]
                proxyip = ProxyIpM(ipStr, port, address, type1, time1, time2)
                session.add(proxyip)
                session.commit()
        # Move on to the next page only after this page is fully processed.
        if n < 10:
            startProxyIP(url, n + 1)
    except ConnectionError as err:
        print(repr(err))
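ProxyIpM is constructed positionally with the six fields scraped from each ip_list row, but its definition is not shown. A plausible declarative model, with the table and column names assumed:

from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ProxyIpM(Base):
    # Assumed model: one row per proxy scraped from the ip_list table.
    __tablename__ = 'proxy_ip'  # table name is a guess

    id = Column(Integer, primary_key=True)
    ip = Column(String(64))
    port = Column(String(16))
    address = Column(String(128))  # proxy location
    type1 = Column(String(16))     # HTTP / HTTPS
    time1 = Column(String(32))     # verification timestamps, as scraped
    time2 = Column(String(32))

    def __init__(self, ip, port, address, type1, time1, time2):
        self.ip = ip
        self.port = port
        self.address = address
        self.type1 = type1
        self.time1 = time1
        self.time2 = time2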
Example #4
 def saveMysql(self, title, author, desc):
     try:
         record = PhotographyM(title, author, desc)
         session.add(record)
         session.commit()
         return record.id  # the id is populated by the database on commit
     except InvalidRequestError as err:
         print("Insert error: %r" % err)
         session.rollback()
     except Exception as err:
         print("Mysql3 unknown error: %r" % err)
         session.rollback()
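saveMysql returns the new row's id (or None after a rollback), which is what scrapyDetail and savePic take as their id parameter. A hypothetical wiring of the three steps, with spider standing in for an instance of the scraper class:

new_id = spider.saveMysql(title, author, desc)
if new_id is not None:
    spider.scrapyDetail(detailurl, proxies, new_id)  # fill in the post body
    spider.savePic(imgcover, proxies, new_id)        # download the cover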
Example #5
 def scrapyDetail(self, url, proxies):
     headers = {
         "user-agent": userAgents[random.randrange(0, len(userAgents))]
     }
     try:
         urlStr = url + "?tag=热门&from=gaia_video"
         res_html = requests.get(urlStr,
                                 headers=headers,
                                 proxies=proxies,
                                 timeout=2).text
         # Re-encode to UTF-8 bytes for lxml, dropping unencodable characters.
         res_html = res_html.encode("utf-8", 'ignore')
         ele = etree.HTML(res_html)
         title = ele.xpath('//span[@property="v:itemreviewed"]/text()')[0]
         fType = "/".join(ele.xpath('//span[@property="v:genre"]/text()'))
         director = "/".join(
             ele.xpath(
                 '//span[@class="attrs"]/a[@rel="v:directedBy"]/text()'))
         performer = "/".join(
             ele.xpath(
                 '//span[@class="actor"]/span[@class="attrs"]/a[@rel="v:starring"]/text()'
             ))
         score = "/".join(
             ele.xpath('//strong[@property="v:average"]/text()'))
         releaseDate = ",".join(
             ele.xpath('//span[@property="v:initialReleaseDate"]/text()'))
         timelen = "/".join(
             ele.xpath('//span[@property="v:runtime"]/text()'))
         introduce = ";".join(
             ele.xpath('//span[@property="v:summary"]/text()'))
         try:
             Films = FilmsM(title, fType, director, performer, score,
                            releaseDate, timelen, introduce)
             session.add(Films)
             session.commit()
             return Films.id
         except InvalidRequestError as err:
             print("InvalidRequestError: %r" % err)
             session.rollback()
         except Exception as err:
             print("Exception: %r" % err)
             session.rollback()
     except Exception as err:
         print("爬取详情Error:{0}".format(err))
Example #6
    def downloadImg_s(self, imgcover, id, proxies):
        basedir = os.path.dirname(__file__)
        # Derive the local folder and filename from the image URL's path.
        hostpath = parse.urlparse(imgcover)
        imgpathlist = hostpath[2].split("/")
        imgname = imgpathlist[-1]
        imgpathlist.remove(imgname)
        filepath = os.path.join(basedir, '..', '..', 'static',
                                '/'.join(imgpathlist)[1:])
        headers = {
            "user-agent": userAgents[random.randrange(0, len(userAgents))]
        }
        if not os.path.exists(filepath):
            print("路径不存在,正在创建路径~~~")
            os.makedirs(filepath)
        try:
            res = requests.get(imgcover,
                               headers=headers,
                               proxies=proxies,
                               timeout=2)
            with open(os.path.join(filepath, imgname), "wb") as fp:
                fp.write(res.content)
            # Store the local URL of the saved cover on the matching row.
            try:
                query = session.query(FilmsM).filter(
                    FilmsM.id == id).all()[0]
                imgurl = '/static/{0}/{1}'.format(
                    '/'.join(imgpathlist)[1:], imgname)
                query.fimgurl = imgurl
                session.commit()
            except InvalidRequestError as err:
                print("InvalidRequestError: %r" % err)
                session.rollback()
            except Exception as err:
                print("Exception: %r" % err)
                session.rollback()
        except Exception as err:
            print("图片下载Error:{0}".format(err))


# ScrapyFilms('https://movie.douban.com/j/search_subjects?type=movie&tag=可播放&sort=rank&playable=on&page_limit=20&page_start={0}')
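Examples #1, #2, #5 and #6 all expect a proxies mapping, and Example #3 is presumably where those proxies come from. A sketch that picks a random stored proxy and shapes it for requests (column names follow the assumed ProxyIpM model above):

def randomProxies():
    # Pick one stored proxy at random and shape it for requests.
    rows = session.query(ProxyIpM).all()
    if not rows:
        return None
    row = random.choice(rows)
    addr = "http://{0}:{1}".format(row.ip, row.port)
    return {"http": addr, "https": addr}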