def scrapyDetail(self, detailurl, proxies, id):
    """Fetch an article detail page and store its HTML body into the row
    ``PhotographyM.id == id`` (column ``content``).

    :param detailurl: absolute URL of the detail page to scrape
    :param proxies: requests-style proxy mapping used for the GET
    :param id: primary key of the PhotographyM row to update
    """
    headers = {
        'Connection': 'close',
        "user-agent": userAgents[random.randrange(0, len(userAgents))]
    }
    req = s.get(detailurl, headers=headers, proxies=proxies, timeout=2)
    ele = etree.HTML(req.text)
    # All <p> children of the first post-content <div>.
    content_ele = ele.xpath("//div[@class='post-content']/div")[0].xpath("p")
    # Build the fragment with a list + join instead of repeated string
    # concatenation (O(n) instead of quadratic).
    parts = ["<div>"]
    for ele_item in content_ele:
        parts.append(str(etree.tostring(
            ele_item, encoding="utf-8", method="HTML", pretty_print=True),
            encoding="utf8"))
    # BUG FIX: the original wrote `content_str + '</div>'` — a discarded
    # expression — so the closing tag was never appended and unbalanced
    # HTML was stored.
    parts.append('</div>')
    content_str = "".join(parts)
    try:
        Photography = session.query(PhotographyM).filter(
            PhotographyM.id == id).all()
        if Photography:  # only update when the row exists
            Photography[0].content = content_str
            session.commit()
    except InvalidRequestError as err:
        # BUG FIX: original printed the exception *class*
        # (`InvalidRequestError`), not the caught instance.
        print("更新Error: %r" % err)
        session.rollback()
    except Exception as err:
        print("Mysql2未知Error: %r" % err)
        session.rollback()
def savePic(self, imgcover, proxies, id):
    """Download a cover image, save it under static/photography/, and
    write the resulting local URL back to ``PhotographyM.id == id``.

    :param imgcover: remote image URL
    :param proxies: requests-style proxy mapping used for the GET
    :param id: primary key of the PhotographyM row to update
    """
    basedir = os.path.dirname(__file__)
    hostpath = parse.urlparse(imgcover)
    # NOTE(review): index 4 of urlparse is the *query* component — the image
    # path/name is apparently carried in the query string for this site.
    # Confirm against real URLs before changing.
    imgpathlist = hostpath[4].split("/")
    imgname = imgpathlist[-1].split("&")[0]
    imgpath = os.path.abspath(
        os.path.join(basedir, '..', '..', 'static', 'photography',
                     "/".join(imgpathlist[3:5])))
    headers = {
        'Connection': 'close',
        "user-agent": random.choice(userAgents)
    }
    if not os.path.exists(imgpath):
        print("路径不存在,正在创建路径~~~~~~")
        os.makedirs(imgpath)
    try:
        res = s.get(imgcover, headers=headers, proxies=proxies, timeout=2)
        # `with` already closes the file; the original's explicit
        # fp.close() inside the block was redundant and has been removed.
        with open(os.path.join(imgpath, imgname), "wb") as fp:
            fp.write(res.content)
        try:
            query = session.query(PhotographyM).filter(
                PhotographyM.id == id).all()[0]
            imgurl = '/static/photography/{0}/{1}'.format(
                "/".join(imgpathlist[3:5]), imgname)
            query.imgurlstr = imgurl
            session.commit()
        except InvalidRequestError as err:
            print("InvalidRequestError %r" % repr(err))
            session.rollback()
        except Exception as e:
            print("Exception %r" % repr(e))
            session.rollback()
    except Exception as err:
        print("图片下载Error:{0}".format(err))
def startProxyIP(url, n):
    """Scrape one page of a proxy-IP listing table (id='ip_list') and
    persist each row as a ProxyIpM record, then recurse to the next page
    until page 10.

    :param url: page URL template containing a ``{0}`` placeholder for
        the page number
    :param n: current page number (recursion stops once n reaches 10)
    """
    url = url.format(n)
    headers = {
        "user-agent": "User-Agent:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    }
    try:
        req = requests.get(url, headers=headers)
        ele = etree.HTML(req.text)
        tr_dom_list = ele.xpath("//table[@id='ip_list']//tr")
        # Skip the header row (index 0) directly instead of testing the
        # index on every iteration.
        for tr_item in tr_dom_list[1:]:
            td_dom_list = tr_item.xpath("./td[not(@class)]")
            if td_dom_list:
                ipStr = td_dom_list[0].xpath('./text()')[0]
                port = td_dom_list[1].xpath("./text()")[0]
                address = td_dom_list[2].xpath("./a/text()")[0]
                type1 = td_dom_list[3].xpath("./text()")[0]
                time1 = td_dom_list[4].xpath("./text()")[0]
                time2 = td_dom_list[5].xpath("./text()")[0]
                proxyip = ProxyIpM(ipStr, port, address, type1, time1, time2)
                session.add(proxyip)
                session.commit()
        if n < 10:
            startProxyIP(url, n + 1)
    except ConnectionError as err:
        # BUG FIX: original printed repr of the ConnectionError *class*,
        # discarding the actual error details.
        print(repr(err))
def saveMysql(self, title, author, desc):
    """Insert a new PhotographyM row and return its generated primary key.

    :param title: article title
    :param author: article author
    :param desc: article description
    :return: the new row's id on success, None if the insert failed
    """
    try:
        insert_sql = PhotographyM(title, author, desc)
        session.add(insert_sql)
        session.commit()
        return insert_sql.id
    except InvalidRequestError as err:
        # BUG FIX: original printed the exception *class*
        # (`InvalidRequestError`), not the caught instance.
        print("插入Error: %r" % err)
        session.rollback()
    except Exception as err:
        print("Mysql3未知Error: %r" % err)
        session.rollback()
def scrapyDetail(self, url, proxies):
    """Scrape a Douban film detail page and insert a FilmsM record.

    :param url: base film-detail URL (query string is appended here)
    :param proxies: requests-style proxy mapping used for the GET
    :return: the new FilmsM row's id on success, None otherwise
    """
    headers = {
        "user-agent": userAgents[random.randrange(0, len(userAgents))]
    }
    try:
        page_url = url + "?tag=热门&from=gaia_video"
        raw = requests.get(page_url, headers=headers, proxies=proxies,
                           timeout=2).text
        doc = etree.HTML(raw.encode("utf-8", 'ignore'))

        # Local helper: run an xpath and join the resulting text nodes.
        def joined(xp, sep):
            return sep.join(doc.xpath(xp))

        title = doc.xpath('//span[@property="v:itemreviewed"]/text()')[0]
        fType = joined('//span[@property="v:genre"]/text()', "/")
        director = joined(
            '//span[@class="attrs"]/a[@rel="v:directedBy"]/text()', "/")
        performer = joined(
            '//span[@class="actor"]/span[@class="attrs"]/a[@rel="v:starring"]/text()',
            "/")
        score = joined('//strong[@property="v:average"]/text()', "/")
        releaseDate = joined(
            '//span[@property="v:initialReleaseDate"]/text()', ",")
        timelen = joined('//span[@property="v:runtime"]/text()', "/")
        introduce = joined('//span[@property="v:summary"]/text()', ";")

        try:
            record = FilmsM(title, fType, director, performer, score,
                            releaseDate, timelen, introduce)
            session.add(record)
            session.commit()
            return record.id
        except InvalidRequestError as err:
            print("InvalidRequestError %r" % repr(err))
            session.rollback()
        except Exception as e:
            print("Exception %r" % repr(e))
            session.rollback()
    except Exception as err:
        print("爬取详情Error:{0}".format(err))
def downloadImg_s(self, imgcover, id, proxies):
    """Download a film cover image, save it under static/, and write the
    local URL back to ``FilmsM.id == id`` (column ``fimgurl``).

    :param imgcover: remote image URL
    :param id: primary key of the FilmsM row to update
    :param proxies: requests-style proxy mapping used for the GET
    """
    basedir = os.path.dirname(__file__)
    hostpath = parse.urlparse(imgcover)
    imgpathlist = hostpath[2].split("/")  # path component, split on '/'
    imgname = imgpathlist[-1]
    imgpathlist.remove(imgname)
    # NOTE: `'/'.join(imgpathlist)[1:]` is a *string* slice — the path
    # starts with '/', so joining yields "/a/b" and [1:] strips that
    # leading slash. Do not "fix" it to imgpathlist[1:].
    filepath = os.path.join(basedir, '..', '..', 'static',
                            '/'.join(imgpathlist)[1:])
    headers = {
        "user-agent": userAgents[random.randrange(0, len(userAgents))]
    }
    if not os.path.exists(filepath):
        print("路径不存在,正在创建路径~~~")
        os.makedirs(filepath)
    try:
        res = requests.get(imgcover, headers=headers, proxies=proxies,
                           timeout=2)
        # `with` already closes the file; the original's explicit
        # fp.close() inside the block was redundant and has been removed.
        with open(os.path.join(filepath, imgname), "wb") as fp:
            fp.write(res.content)
        try:
            query = session.query(FilmsM).filter(
                FilmsM.id == id).all()[0]
            imgurl = '/static/{0}/{1}'.format(
                '/'.join(imgpathlist)[1:], imgname)
            query.fimgurl = imgurl
            session.commit()
        except InvalidRequestError as err:
            print("InvalidRequestError %r" % repr(err))
            session.rollback()
        except Exception as e:
            print("Exception %r" % repr(e))
            session.rollback()
    except Exception as err:
        print("图片下载Error:{0}".format(err))
# ScrapyFilms('https://movie.douban.com/j/search_subjects?type=movie&tag=可播放&sort=rank&playable=on&page_limit=20&page_start={0}')