def process_item(self, item, spider):
    """Scrapy pipeline hook: append *item* as one JSON line to a
    date-stamped file (``weatherYYYYMMDD.json``) and pass it through.

    Logging levels mirror the original pipeline exactly (start is
    logged at error level, end at warn level).
    """
    log = MyLog()
    stamp = time.strftime('%Y%m%d', time.localtime())
    out_path = 'weather{0}.json'.format(stamp)
    log.error('转换json开始')
    # One JSON object per line, keeping non-ASCII characters readable.
    with codecs.open(out_path, 'a', encoding='utf8') as out:
        out.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    log.warn('转换json结束')
    return item
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Smoke test for MyLog: emit one sample message at every level."""

from myLog import MyLog

if __name__ == '__main__':
    ml = MyLog()
    # One (emitter, message) pair per severity level, ascending.
    samples = (
        (ml.debug, "I am the debug message"),
        (ml.info, "I am the info message"),
        (ml.warn, "I am the warn message"),
        (ml.error, "I am the error message"),
        (ml.critical, "I am the critical message"),
    )
    for emit, message in samples:
        emit(message)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Smoke test for MyLog: drive every severity level once."""
__author__ = 'hstking [email protected]'

from myLog import MyLog

if __name__ == '__main__':
    ml = MyLog()
    # Dispatch by level name; the rendered strings are identical to the
    # original literals ('I am debug message', ... 'I am critical message').
    for level in ('debug', 'info', 'warn', 'error', 'critical'):
        getattr(ml, level)('I am %s message' % level)
class DownloadYinyuetaiMv(object):
    """Interactive downloader for a single yinyuetai.com MV (Python 2).

    Workflow: prompt for a play-page URL, scrape the title from the page,
    resolve the real ``.flv`` media URL via the in-site API, then stream
    it to ``./<title>.mp4`` while a child process prints progress.
    """

    def __init__(self):
        clear()
        self.tip()
        self.log = MyLog()
        self.title = 'unknow'            # replaced once the page is parsed
        self.packageSize = 1024 * 1024   # download chunk size: 1 MiB
        # BUGFIX: getMvPlayUrl now returns the URL; previously it returned
        # None, so this assignment clobbered the value set inside it.
        self.mvPlayUrl = self.getMvPlayUrl()

    def getMvPlayUrl(self):
        """Prompt the user for the MV play-page URL, validate it, and
        return it (previously returned None implicitly)."""
        self.log.info('获取mv的播放地址')
        url = raw_input('输入音乐台中MV的播放地址\n如http://v.yinyuetai.com/video/615494:\n')
        self.mvPlayUrl = url
        self.checkMvPlayUrl(url)
        return url

    def checkMvPlayUrl(self, url):
        """Validate the play URL, scrape the MV title, and start the
        download chain. Invalid URLs are logged and abort the flow."""
        self.log.info('检查mv播放地址')
        try:
            id = url.replace('http://v.yinyuetai.com/video/', '')
            int(id)  # trailing path component must be a numeric video id
        except ValueError:
            self.log.error('输入的mv播放地址有误,退出程序')
            # BUGFIX: previously fell through and fetched the bad URL anyway.
            return
        res = urllib2.urlopen(url, timeout=5)
        mat = re.compile(r'<h3 class="fl f18">(.*?)</h3>')
        self.title = re.findall(mat, res.read())[0]
        print('MV:%s' % self.title)
        downUrl = self.getMvDownloadUrl(id)
        self.downloadMv(downUrl)

    def getMvDownloadUrl(self, id):
        """Resolve the real ``.flv`` media URL for video *id*.

        Returns the last (apparently highest-quality) candidate found."""
        self.log.info('获取mv下载地址')
        url = 'http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + id
        try:
            res = urllib2.urlopen(url, timeout=5)
        except Exception:
            self.log.error('网页连接错误')
            # BUGFIX: the error was swallowed and 'res' was then undefined,
            # crashing with NameError; re-raise the real failure instead.
            raise
        mat = re.compile(r'http://h.?.yinyuetai.com/uploads/videos/common/.*?\.flv')
        urls = re.findall(mat, res.read())
        return urls[-1]

    def downloadMv(self, url):
        """Stream the media at *url* to ``./<title>.mp4`` in 1 MiB chunks,
        spawning a child process per chunk to print progress."""
        fileName = './' + self.title + '.mp4'
        res = urllib2.urlopen(url, timeout=5)
        self.log.info('开始下载MV %s' % fileName)
        rSize = int(dict(res.headers).get('content-length'))
        t1 = time.time()
        with open(fileName, 'wb') as fp:
            offset = 0
            st = res.read(self.packageSize)
            while st:
                fp.write(st)
                # BUGFIX: count the chunk just written. Previously the NEXT
                # chunk's length was added instead, so the first chunk was
                # never counted and progress under-reported by 1 MiB.
                offset += len(st)
                p = multiprocessing.Process(target=self.pLen,
                                            args=(fileName, offset, rSize,))
                p.start()
                st = res.read(self.packageSize)
        t2 = time.time()
        time.sleep(2)
        print(u'\n下载时间共%ds\n' % (t2 - t1))

    def pLen(self, fileName, offset, rSize):
        # Progress printer run in a child process; the trailing comma is the
        # Python 2 "no newline" form, keeping output on one line via '\r'.
        if offset < rSize:
            print('%s\t%dbytes/%dbytes\r' % (fileName, offset, rSize)),
            time.sleep(1)

    def tip(self):
        """Print the banner shown at startup."""
        print('|' + '-' * 40)
        print('|' + u'这是一个下载音悦台MV的脚本')
        print('|' + '-' * 40)
class DownloadYinyuetaiMv(object):
    # Interactive downloader for a single yinyuetai.com MV (Python 2 code:
    # raw_input, urllib2, print-with-trailing-comma). Prompts for a play-page
    # URL, scrapes the title, resolves the .flv media URL, streams it to
    # ./<title>.mp4, and prints progress from a child process.

    def __init__(self):
        clear()
        self.tip()
        self.log = MyLog()
        self.title = 'unknow'            # replaced once the page is parsed
        self.packageSize = 1024 * 1024   # download chunk size: 1 MiB
        # NOTE(review): getMvPlayUrl returns None, so this assignment
        # overwrites the URL it stored with None — confirm intent.
        self.mvPlayUrl = self.getMvPlayUrl()

    def getMvPlayUrl(self):
        '''Prompt the user for the MV play-page URL and hand it to
        checkMvPlayUrl. Stores the URL on self.mvPlayUrl; returns None.
        '''
        self.log.info('获取mv的播放地址')
        self.mvPlayUrl = raw_input(
            '输入音乐台中MV的播放地址\n如http://v.yinyuetai.com/video/615494:\n')
        self.checkMvPlayUrl(self.mvPlayUrl)

    def checkMvPlayUrl(self, url):
        '''Validate the play URL, scrape the MV title, and kick off the
        download chain.
        '''
        self.log.info('检查mv播放地址')
        try:
            id = url.replace('http://v.yinyuetai.com/video/', '')
            idNum = int(id)  # trailing path component must be numeric
        except ValueError:
            # NOTE(review): only logs — execution continues and still
            # fetches the invalid URL below; probably should return here.
            self.log.error('输入的mv播放地址有误,退出程序')
        res = urllib2.urlopen(url, timeout=5)
        mat = re.compile(r'<h3 class="fl f18">(.*?)</h3>')
        self.title = re.findall(mat, res.read())[0]
        print('MV:%s' % self.title)
        downUrl = self.getMvDownloadUrl(id)
        self.downloadMv(downUrl)

    def getMvDownloadUrl(self, id):
        '''Resolve the real .flv media URL for video *id* via the in-site
        API; returns the last candidate found.
        '''
        self.log.info('获取mv下载地址')
        url = 'http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + id
        try:
            res = urllib2.urlopen(url, timeout=5)
        except:
            # NOTE(review): swallows the error, then 'res' below is
            # undefined (NameError) — should return or re-raise.
            self.log.error('网页连接错误')
        mat = re.compile(
            r'http://h.?.yinyuetai.com/uploads/videos/common/.*?\.flv')
        urls = re.findall(mat, res.read())
        return urls[-1]

    def downloadMv(self, url):
        '''Stream the media at *url* to ./<title>.mp4 in 1 MiB chunks,
        spawning a child process per chunk to print progress.
        '''
        fileName = './' + self.title + '.mp4'
        res = urllib2.urlopen(url, timeout=5)
        self.log.info('开始下载MV %s' % fileName)
        rSize = int(dict(res.headers).get('content-length'))
        t1 = time.time()
        with open(fileName, 'wb') as fp:
            st = res.read(self.packageSize)
            offset = 0
            while st:
                fp.write(st)
                st = res.read(self.packageSize)
                # NOTE(review): adds the length of the NEXT chunk, so the
                # first chunk is never counted and progress under-reports.
                offset += len(st)
                p = multiprocessing.Process(target=self.pLen, args=(
                    fileName,
                    offset,
                    rSize,
                ))
                p.start()
        t2 = time.time()
        time.sleep(2)
        print(u'\n下载时间共%ds\n' % (t2 - t1))

    def pLen(self, fileName, offset, rSize):
        # Progress printer run in a child process; the trailing comma is
        # the Python 2 "suppress newline" form ('\r' keeps one line).
        if offset < rSize:
            print('%s\t%dbytes/%dbytes\r' % (fileName, offset, rSize)),
            time.sleep(1)

    def tip(self):
        # Startup banner.
        print('|' + '-' * 40)
        print('|' + u'这是一个下载音悦台MV的脚本')
        print('|' + '-' * 40)
"""Smoke test for MyLog: one sample message per severity level."""

from myLog import MyLog

if __name__ == '__main__':
    ml = MyLog()
    samples = (
        (ml.debug, "1'm a debug message"),   # original's '1' typo kept as-is
        (ml.info, "I'm an info message"),
        (ml.warn, "I'm a warn message"),
        (ml.error, "I'm an error message"),
        (ml.critical, "I'm a critical message"),
    )
    for emit, message in samples:
        emit(message)
class WBSpider(object):
    """Sina Weibo crawler driven by a Selenium browser.

    Attributes:
        username: Weibo account name.
        password: Weibo account password.
        driver:   Selenium WebDriver — Chrome here (the original docstring
                  claimed PhantomJS, contradicting the code).
        isLogin:  1 after the login form was submitted, else 0.
        uid:      uid of the user whose posts will be crawled.
    """

    def __init__(self, username, password):
        self.log = MyLog()              # project logger
        self.username = username
        self.password = password
        self.driver = webdriver.Chrome()
        self.driver.implicitly_wait(5)  # implicit 5s wait on element lookups
        self.isLogin = 0                # login flag
        self.uid = ""                   # set later via setUid()

    def __del__(self):
        # Close the browser when this spider instance is destroyed.
        self.driver.close()

    def loginWeibo(self):
        """Fill and submit the Sina login form; sets self.isLogin.

        NOTE(review): success is assumed once the form is submitted — the
        credentials are never actually verified against the landing page.
        """
        self.driver.get("http://login.sina.com.cn/")
        self.driver.implicitly_wait(5)
        elem_user = self.driver.find_element_by_name("username")
        elem_user.send_keys(self.username)
        elem_pwd = self.driver.find_element_by_name("password")
        elem_pwd.send_keys(self.password)
        try:
            time.sleep(5)
            elem_pwd.send_keys(Keys.RETURN)  # submit via the Enter key
            time.sleep(2)
            self.log.info('登陆成功...')
            self.isLogin = 1
        except Exception:
            # BUGFIX: was 'self.Log.error(...)' (capital L) — an
            # AttributeError raised inside the error handler itself.
            self.log.error("Login Error")
            self.isLogin = 0

    def setUid(self, Uid):
        """Set the uid of the Weibo user to crawl."""
        self.uid = Uid

    def getWeibo(self, PageNum):
        """Crawl up to *PageNum* pages of the target user's posts.

        Returns a list of post texts (possibly partial on error), or None
        when the spider is not in a crawlable state.
        """
        total = PageNum
        # Guard clauses: refuse to crawl in an invalid state.
        if self.isLogin == 0:
            self.log.error("没有登录微博!")
            return
        if self.uid == "":
            self.log.error("待爬取的微博主的uid为空,请设置!")
            return
        if PageNum < 0:
            self.log.error("页数设置不合法")
            return
        weiboList = []
        url = "http://weibo.com/" + self.uid
        self.driver.get(url)
        self.driver.implicitly_wait(5)
        self.log.debug("准备访问个人网站....." + str(url))
        self.log.info('个人详细信息')
        print(u'用户id: ' + self.uid)
        self.driver.implicitly_wait(5)
        str_name = self.driver.find_element_by_xpath(
            "//div[@class='pf_username']/h1")
        name = str_name.text  # unicode nickname scraped from the profile
        self.log.info("昵称:" + str(name))
        self.driver.implicitly_wait(5)
        try:
            while True:
                print("正在爬取第" + str(total - PageNum + 1) + "页")
                try:
                    next_page = self.driver.find_element_by_link_text('下一页')
                except Exception:
                    next_page = None
                Count = 0
                # Scroll to the bottom repeatedly until the lazily-loaded
                # "next page" link appears (give up after 200 attempts).
                while next_page is None:
                    try:
                        next_page = self.driver.find_element_by_link_text('下一页')
                    except Exception:
                        next_page = None
                    Count = Count + 1
                    print(Count)
                    time.sleep(3)
                    self.driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(3)
                    if Count == 200:
                        break
                # Collect the post bodies currently on the page.
                weiboelem = self.driver.find_elements_by_xpath(
                    "//div[@action-type='feed_list_item']/div[@node-type='feed_content']/div[@class='WB_detail']/div[@node-type='feed_list_content']"
                )
                for elem in weiboelem:
                    weiboList.append(elem.text)
                if next_page is None:
                    break
                if PageNum == 0:
                    self.log.info("到达尾页")
                    break
                # The button may be covered and not directly clickable, so
                # move to it first, then click.
                ActionChains(self.driver).move_to_element(next_page).click(
                    next_page).perform()
                next_page.click()
                # BUGFIX: was 'Pagenum = Pagenum - 1' (wrong case) — a
                # NameError that the broad except below silently turned
                # into "爬取异常" after the first page.
                PageNum = PageNum - 1
                self.driver.implicitly_wait(5)
        except Exception:
            self.log.error("爬取异常")
        finally:
            # Best effort: always return whatever was collected so far.
            return weiboList
def testLog():
    """Emit one debug and one error message through a fresh MyLog."""
    log = MyLog()
    for emit, text in ((log.debug, 'it is debug'),
                       (log.error, "I'm error")):
        emit(text)
# Reconstructed from a corrupted paste: the original line contained
# duplicated fragments ('"ht(" t("...', 'return rn', 'get_pro_proxy',
# 'get_htm_html'). The first two defs take 'self' and appear to be methods
# of a proxy-using spider class whose definition is not visible here.


def delete_proxy(self, proxy):
    """Tell the local proxy-pool service (port 5010) to drop *proxy*."""
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


def get_html(self, url):
    """Fetch *url* through a pooled HTTP proxy, retrying up to 5 times.

    After 5 consecutive failures the proxy is removed from the pool and
    the fetch restarts with a fresh proxy from self.get_proxy().
    """
    retry_count = 5
    proxy = self.get_proxy()  # was garbled as 'get_pro_proxy'
    print(proxy)
    while retry_count > 0:
        try:
            # Route the request through the pooled proxy.
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html.text
        except Exception:
            retry_count -= 1
    # Failed 5 times: discard this proxy and retry with another one.
    self.delete_proxy(proxy)
    return self.get_html(url)


logger = MyLog()


# NOTE(review): this module-level get_html(url) shadows the method-style
# helper above at module scope, exactly as in the original mashed file.
def get_html(url):
    """Download a page, honoring the server's apparent encoding.

    Returns the page text, or a sentinel error string on any failure.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        logger.error('get_html出错页面为: ' + url)
        return " ERROR when get html"


def get_content(url):
    """Scrape forum name, publish time and topic from a BBS post page.

    Each field is scraped independently; a failed field is logged and
    left as None without aborting the others.
    """
    print("当前爬取的网页为" + url)
    soup = BeautifulSoup(get_html(url), 'lxml')
    try:
        forum_name = soup.find('p', attrs={'class': "nrbt"}).a.text
        print("当前爬取的论坛名为:" + forum_name)
    except Exception:
        logger.error("get_forum_name出错" + "页面为" + url)
        forum_name = None
    try:
        time = soup.find('p', attrs={'class': "fbsj"}).text[4:]
        time = datetime.strptime(time, '%Y-%m-%d %H:%M')
        # BUGFIX: concatenating a datetime to str raised TypeError, which
        # wrongly sent every page down the error path; stringify first.
        print("这个帖子的发表时间为:" + str(time))
    except Exception:
        logger.error("get_publish_time出错" + "页面为" + url)
        time = None
    try:
        topic = soup.find('div', attrs={'class': "nr_r_c"}).find(
            'p', attrs={'class': "contitle"}).text
        print("帖子的主题为:" + topic)
        # BUGFIX: removed the stray 'topic = None' that unconditionally
        # discarded the freshly parsed topic.
    except Exception:
        logger.error("get_topic出错" + "页面为" + url)
        topic = None
    # BUGFIX: removed 'print(all_neirong)' — the loop that built
    # 'all_neirong' was commented out upstream, so the print raised
    # NameError on every call.


# get_content("http://bbs.12365auto.com/postcontent.aspx?tID=47547&sId=1527&ppage=1&from=s")
get_content("http://bbs.12365auto.com/postcontent.aspx?tID=133692&sId=1147&ppage=1&from=s")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Minimal MyLog smoke test: each level logs its own name."""

from myLog import MyLog

if __name__ == "__main__":
    logger = MyLog()
    # Each level's message is simply the level's own name.
    for level in ('debug', 'info', 'warn', 'error', 'critical'):
        getattr(logger, level)(level)