Exemple #1
0
 def process_item(self, item, spider):
     """Append the scraped item to a dated JSON-lines file and pass it on."""
     log = MyLog()
     date_stamp = time.strftime('%Y%m%d', time.localtime())
     out_name = 'weather' + date_stamp + '.json'
     log.error('转换json开始')
     serialized = json.dumps(dict(item), ensure_ascii=False) + '\n'
     with codecs.open(out_name, 'a', encoding='utf8') as out_file:
         out_file.write(serialized)
     log.warn('转换json结束')
     return item
Exemple #2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from myLog import MyLog

if __name__ == '__main__':
    ml = MyLog()
    # Emit one sample message per MyLog severity.
    for method, message in [
        (ml.debug, "I am the debug message"),
        (ml.info, "I am the info message"),
        (ml.warn, "I am the warn message"),
        (ml.error, "I am the error message"),
        (ml.critical, "I am the critical message"),
    ]:
        method(message)
Exemple #3
0
#!/usr/bin/env python
#-*- coding: utf-8 -*-
__author__ = 'hstking [email protected]'

from myLog import MyLog

if __name__ == '__main__':
    logger = MyLog()
    # Each severity logs "I am <level> message".
    for level in ('debug', 'info', 'warn', 'error', 'critical'):
        getattr(logger, level)('I am %s message' % level)
Exemple #4
0
class DownloadYinyuetaiMv(object):
	"""Interactive downloader for an MV hosted on yinyuetai.com (Python 2)."""

	def __init__(self):
		clear()
		self.tip()
		self.log = MyLog()
		self.title = 'unknow'
		# Streaming chunk size: 1 MiB per read.
		self.packageSize = 1024*1024
		self.mvPlayUrl = self.getMvPlayUrl()

	def getMvPlayUrl(self):
		'''Prompt the user for the MV play-page URL and validate it.'''
		self.log.info('获取mv的播放地址')
		self.mvPlayUrl = raw_input('输入音乐台中MV的播放地址\n如http://v.yinyuetai.com/video/615494:\n')
		self.checkMvPlayUrl(self.mvPlayUrl)

	def checkMvPlayUrl(self, url):
		'''Validate the play URL, scrape the MV title, then start the download.'''
		self.log.info('检查mv播放地址')
		try:
			vid = url.replace('http://v.yinyuetai.com/video/', '')
			int(vid)  # the path suffix must be a numeric video id
		except ValueError:
			self.log.error('输入的mv播放地址有误,退出程序')
			# BUGFIX: the original fell through here and crashed later on
			# urlopen; stop as the log message promises.
			return
		res = urllib2.urlopen(url, timeout=5)
		mat = re.compile(r'<h3 class="fl f18">(.*?)</h3>')
		self.title = re.findall(mat, res.read())[0]

		print('MV:%s' % self.title)

		downUrl = self.getMvDownloadUrl(vid)
		self.downloadMv(downUrl)

	def getMvDownloadUrl(self, vid):
		'''Return the last (highest-quality) .flv URL for the given video id.'''
		self.log.info('获取mv下载地址')
		url = 'http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + vid
		try:
			res = urllib2.urlopen(url, timeout=5)
		except Exception:
			self.log.error('网页连接错误')
			# BUGFIX: re-raise instead of falling through to an unbound
			# `res` (which raised NameError in the original).
			raise
		mat = re.compile(r'http://h.?.yinyuetai.com/uploads/videos/common/.*?\.flv')
		urls = re.findall(mat, res.read())
		return urls[-1]

	def downloadMv(self, url):
		'''Stream the MV into ./<title>.mp4, printing progress from subprocesses.'''
		fileName = './' + self.title + '.mp4'
		res = urllib2.urlopen(url, timeout=5)
		self.log.info('开始下载MV %s' % fileName)
		rSize = int(dict(res.headers).get('content-length'))
		t1 = time.time()
		with open(fileName, 'wb') as fp:
			st = res.read(self.packageSize)
			offset = 0
			while st:
				fp.write(st)
				# BUGFIX: count the chunk just written; the original added
				# the size of the *next* chunk, skewing the progress figure.
				offset += len(st)
				st = res.read(self.packageSize)
				# One short-lived process per chunk, just to print progress.
				p = multiprocessing.Process(target=self.pLen, args=(fileName, offset, rSize,))
				p.start()
		t2 = time.time()
		time.sleep(2)
		print(u'\n下载时间共%ds\n' % (t2 - t1))

	def pLen(self, fileName, offset, rSize):
		# Print a one-line progress snapshot while the download is incomplete.
		if offset < rSize:
			print('%s\t%dbytes/%dbytes\r' % (fileName, offset, rSize)),
			time.sleep(1)

	def tip(self):
		# Startup banner.
		print('|' + '-'*40)
		print('|' + u'这是一个下载音悦台MV的脚本')
		print('|' + '-'*40)
Exemple #5
0
class DownloadYinyuetaiMv(object):
    """Interactive downloader for an MV hosted on yinyuetai.com.

    NOTE(review): Python 2 code (raw_input, urllib2); duplicate of the
    tab-indented variant earlier in this dump.
    """

    def __init__(self):
        clear()
        self.tip()
        self.log = MyLog()
        self.title = 'unknow'
        # Streaming chunk size: 1 MiB per read.
        self.packageSize = 1024 * 1024
        self.mvPlayUrl = self.getMvPlayUrl()

    def getMvPlayUrl(self):
        '''Prompt the user for the MV play-page URL and hand it to the
        validator. Returns None; the URL is kept in self.mvPlayUrl.'''
        self.log.info('获取mv的播放地址')
        self.mvPlayUrl = raw_input(
            '输入音乐台中MV的播放地址\n如http://v.yinyuetai.com/video/615494:\n')
        self.checkMvPlayUrl(self.mvPlayUrl)

    def checkMvPlayUrl(self, url):
        '''Validate the play URL, scrape the MV title, then start download.'''
        self.log.info('检查mv播放地址')
        try:
            id = url.replace('http://v.yinyuetai.com/video/', '')
            idNum = int(id)
        except ValueError:
            # NOTE(review): logs "exit program" but execution falls
            # through — urlopen below will then fail on the invalid URL.
            self.log.error('输入的mv播放地址有误,退出程序')
        res = urllib2.urlopen(url, timeout=5)
        mat = re.compile(r'<h3 class="fl f18">(.*?)</h3>')
        self.title = re.findall(mat, res.read())[0]

        print('MV:%s' % self.title)

        downUrl = self.getMvDownloadUrl(id)
        self.downloadMv(downUrl)

    def getMvDownloadUrl(self, id):
        '''Return the last matching .flv download URL for the video id.'''
        self.log.info('获取mv下载地址')
        url = 'http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + id
        try:
            res = urllib2.urlopen(url, timeout=5)
        except:
            # NOTE(review): bare except; on failure `res` stays unbound and
            # the read below raises NameError.
            self.log.error('网页连接错误')
        mat = re.compile(
            r'http://h.?.yinyuetai.com/uploads/videos/common/.*?\.flv')
        urls = re.findall(mat, res.read())
        return urls[-1]

    def downloadMv(self, url):
        '''Stream the MV into ./<title>.mp4 and spawn progress printers.'''
        fileName = './' + self.title + '.mp4'
        res = urllib2.urlopen(url, timeout=5)
        self.log.info('开始下载MV %s' % fileName)
        rSize = int(dict(res.headers).get('content-length'))
        t1 = time.time()
        with open(fileName, 'wb') as fp:
            st = res.read(self.packageSize)
            offset = 0
            while st:
                fp.write(st)
                st = res.read(self.packageSize)
                # NOTE(review): offset adds the size of the *next* chunk,
                # so the progress figure misses the first chunk's bytes.
                offset += len(st)
                # One short-lived process per chunk, just to print progress.
                p = multiprocessing.Process(target=self.pLen,
                                            args=(
                                                fileName,
                                                offset,
                                                rSize,
                                            ))
                p.start()
        t2 = time.time()
        time.sleep(2)
        print(u'\n下载时间共%ds\n' % (t2 - t1))

    def pLen(self, fileName, offset, rSize):
        # Print a one-line progress snapshot (Python 2 trailing-comma print).
        if offset < rSize:
            print('%s\t%dbytes/%dbytes\r' % (fileName, offset, rSize)),
            time.sleep(1)

    def tip(self):
        # Startup banner.
        print('|' + '-' * 40)
        print('|' + u'这是一个下载音悦台MV的脚本')
        print('|' + '-' * 40)
Exemple #6
0
from myLog import MyLog
if __name__ == '__main__':
    ml = MyLog()
    # One sample line per severity (messages kept verbatim, typo included).
    samples = (
        ('debug', "1'm a debug message"),
        ('info', "I'm an info message"),
        ('warn', "I'm a warn message"),
        ('error', "I'm an error message"),
        ('critical', "I'm a critical message"),
    )
    for level, message in samples:
        getattr(ml, level)(message)
Exemple #7
0
class WBSpider(object):
    '''
    Weibo crawler driven by Selenium.

    Attributes:
        username: Weibo account name
        password: Weibo account password
        driver:   the browser driver (Chrome here)
    '''

    def __init__(self, username, password):
        self.log = MyLog()  # logging helper
        self.username = username
        self.password = password
        self.driver = webdriver.Chrome()
        # Implicit wait of 5s for element lookups (the original comment
        # claimed 10s; the code uses 5).
        self.driver.implicitly_wait(5)
        self.isLogin = 0  # set to 1 once loginWeibo succeeds
        self.uid = ""  # uid of the account to crawl; set via setUid()

    def __del__(self):
        '''Close the browser when this instance is destroyed.'''
        self.driver.close()

    def loginWeibo(self):
        '''Log in with username/password; sets self.isLogin to 1 on success,
        0 on failure.'''
        self.driver.get("http://login.sina.com.cn/")
        self.driver.implicitly_wait(5)
        elem_user = self.driver.find_element_by_name("username")  # username box
        elem_user.send_keys(self.username)
        elem_pwd = self.driver.find_element_by_name("password")  # password box
        elem_pwd.send_keys(self.password)
        try:
            time.sleep(5)
            elem_pwd.send_keys(Keys.RETURN)  # submit via the Enter key
            time.sleep(2)
            self.log.info('登陆成功...')
            self.isLogin = 1
        except:
            # BUGFIX: was `self.Log.error(...)` — the attribute is `log`,
            # so the failure path itself raised AttributeError.
            self.log.error("Login Error")
            self.isLogin = 0

    def setUid(self, Uid):
        '''Set the uid of the Weibo user whose posts will be crawled.'''
        self.uid = Uid

    def getWeibo(self, PageNum):
        '''Crawl up to PageNum pages of posts.

        Returns a list of post texts (possibly partial if an error occurs),
        or None when a precondition fails.
        '''
        total = PageNum
        # Preconditions: logged in, uid set, sane page count.
        if self.isLogin == 0:
            self.log.error("没有登录微博!")
            return
        if self.uid == "":
            self.log.error("待爬取的微博主的uid为空,请设置!")
            return
        if PageNum < 0:
            self.log.error("页数设置不合法")
            return
        weiboList = []
        url = "http://weibo.com/" + self.uid
        self.driver.get(url)
        self.driver.implicitly_wait(5)
        self.log.debug("准备访问个人网站....." + str(url))
        self.log.info('个人详细信息')
        print(u'用户id: ' + self.uid)
        self.driver.implicitly_wait(5)
        # Nickname from the profile header.
        str_name = self.driver.find_element_by_xpath(
            "//div[@class='pf_username']/h1")
        name = str_name.text  # .text is unicode
        self.log.info("昵称:" + str(name))
        self.driver.implicitly_wait(5)
        try:
            while (1):
                print("正在爬取第" + str(total - PageNum + 1) + "页")
                next_page = None
                try:
                    next_page = self.driver.find_element_by_link_text('下一页')
                except:
                    next_page = None
                Count = 0
                # Scroll until the lazily-loaded "next page" link appears;
                # give up after 200 attempts.
                while (next_page is None):
                    try:
                        next_page = self.driver.find_element_by_link_text(
                            '下一页')
                    except:
                        next_page = None
                    Count = Count + 1
                    print(Count)
                    time.sleep(3)
                    self.driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(3)
                    if Count == 200:
                        break
                # Collect the post elements on the current page.
                weiboelem = self.driver.find_elements_by_xpath(
                    "//div[@action-type='feed_list_item']/div[@node-type='feed_content']/div[@class='WB_detail']/div[@node-type='feed_list_content']"
                )
                for i in range(len(weiboelem)):
                    weiboList.append(weiboelem[i].text)
                if (next_page is None):
                    break
                if (PageNum == 0):
                    self.log.info("到达尾页")
                    break
                # The button may be covered, so click through ActionChains.
                # NOTE(review): next_page is clicked twice (ActionChains +
                # .click()) — confirm whether the second click is needed.
                ActionChains(self.driver).move_to_element(next_page).click(
                    next_page).perform()
                next_page.click()
                # BUGFIX: was `Pagenum = Pagenum - 1` (NameError), which
                # aborted the crawl via the broad except after page one.
                PageNum = PageNum - 1
                self.driver.implicitly_wait(5)
        except:
            self.log.error("爬取异常")
        finally:
            return weiboList
Exemple #8
0
def testLog():
    """Smoke-test MyLog by emitting one debug and one error message."""
    logger = MyLog()
    logger.debug('it is debug')
    logger.error("I'm error")
Exemple #9
0
def delete_proxy(self, proxy):
    """Ask the local proxy-pool service to drop a dead proxy.

    NOTE(review): reconstructed from a garbled paste — the original text was
    not valid Python. Verify the endpoint against the proxy-pool service.
    """
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


def get_html(self, url):
    """Fetch url through a pooled proxy, retrying up to 5 times.

    NOTE(review): reconstructed from a garbled paste — the original text was
    not valid Python. The sibling-method names (get_proxy / delete_proxy /
    get_html) are inferred from the mangled fragments; confirm against the
    real class.
    """
    retry_count = 5
    proxy = self.get_proxy()
    print(proxy)
    while retry_count > 0:
        try:
            # Fetch through the proxy.
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html.text
        except Exception:
            retry_count -= 1
    # Five failures: drop this proxy from the pool and retry with a fresh one.
    self.delete_proxy(proxy)
    return self.get_html(url)
logger = MyLog()


def get_html(url):
    """Fetch url and return its decoded text.

    On any request failure the error is logged and the marker string
    " ERROR when get html" is returned (callers check for it).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise on 4xx/5xx responses
        r.encoding = r.apparent_encoding  # guess encoding from the body
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C and programming errors
        # are no longer swallowed.
        logger.error('get_html出错页面为: ' + url)
        return " ERROR when get html"
def get_content(url):
    """Scrape forum name, publish time and topic from a bbs.12365auto.com
    post page, logging (and tolerating) any missing field."""
    print("当前爬取的网页为"+url)
    soup = BeautifulSoup(get_html(url), 'lxml')
    try:
        forum_name = soup.find('p', attrs={'class': "nrbt"}).a.text
        print("当前爬取的论坛名为:"+forum_name)
    except Exception:
        logger.error("get_forum_name出错"+"页面为"+url)
        forum_name = None
    try:
        # Renamed from `time` to stop shadowing the time module.
        publish_time = soup.find('p', attrs={'class': "fbsj"}).text[4:]
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d %H:%M')
        # BUGFIX: the original concatenated a datetime to a str, which raised
        # TypeError here and always forced the time to None via the except.
        print("这个帖子的发表时间为:" + str(publish_time))
    except Exception:
        logger.error("get_publish_time出错" + "页面为" + url)
        publish_time = None
    try:
        topic = soup.find('div', attrs={'class': "nr_r_c"}).find('p', attrs={'class': "contitle"}).text
        print("帖子的主题为:"+topic)
        # BUGFIX: removed the stray `topic = None` that unconditionally
        # discarded the scraped topic.
    except Exception:
        logger.error("get_topic出错" + "页面为" + url)
        topic = None
    # BUGFIX: removed `print(all_neirong)` — the name was only defined in
    # commented-out code, so the call raised NameError on every page; the
    # dead commented-out body-scraping block was deleted as well.









# get_content("http://bbs.12365auto.com/postcontent.aspx?tID=47547&sId=1527&ppage=1&from=s")
# Ad-hoc smoke test: scrapes a single known post page whenever the module runs.
get_content("http://bbs.12365auto.com/postcontent.aspx?tID=133692&sId=1147&ppage=1&from=s")
Exemple #10
0
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from myLog import MyLog

if __name__ == "__main__":
    log = MyLog()
    # One message per level; each message text equals the level name.
    for level in ['debug', 'info', 'warn', 'error', 'critical']:
        getattr(log, level)(level)