def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel
    self.entityId = 'SYSU'
    # self.user_name = "*****@*****.**"
    # self.session = SessionCrawler(sleepRange=[3,8])
    # self.pass_word = "810214bee810214"
    # self.user_name = "15088137907"
    # self.pass_word = "4p2yhynrb7"
    self.user_name_password = self.get_username_password()
    self.user_name = self.user_name_password[0]
    self.pass_word = self.user_name_password[1]
    self.logger.info('username: %s' % self.user_name)
    self.email = SendEmail()
    self.db = InsertDB()
    self.monitor_title = 'Weibo crawler monitor'
    self.proxies = ''
    self.session = SessionCrawler(sleepRange=[3, 8])
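# A minimal sketch of the credential lookup that get_username_password() performs;
# the real implementation is not shown in this file. The config path
# 'conf/credentials.cfg' and the section/option names below are illustrative
# assumptions, not the project's actual layout. Inside the crawler this would be
# a method returning a (user_name, pass_word) tuple, as the constructor expects.
import ConfigParser


def get_username_password():
    parser = ConfigParser.ConfigParser()
    parser.read('conf/credentials.cfg')  # hypothetical path
    user_name = parser.get('weibo', 'user_name')  # hypothetical section/option names
    pass_word = parser.get('weibo', 'pass_word')
    return (user_name, pass_word)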
def __init__(self, logger=None):
    '''
    Constructor
    '''
    self.session = SessionCrawler()
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
def __init__(self, channel, dbProxy, logger=None):
    '''
    Constructor
    '''
    self.channel = channel
    self.dbProxy = dbProxy
    self.loggingPrefix = '[%s]' % self.channel.channel_name
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.baiduCrawler = BaiduCrawler(self.logger)
    self.session = self.baiduCrawler.session
    self.channel = channel
def __init__(self, channel, logger=None):
    '''
    Constructor
    :param channel:
    :param logger:
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.session = SessionCrawler(logger=self.logger)
    self.channel = channel
def __init__(self, channel, dbProxy, logger=None):
    '''
    Constructor
    '''
    self.keywordList2 = list()  # used to filter out articles that contain no keyword
    self.nonekyewordList = list()  # used to filter out articles that contain blacklisted (anti-)keywords
    self.channel = channel
    self.dbProxy = dbProxy
    self.loggingPrefix = '[%s]' % self.channel.channel_name
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.session = SessionCrawler(logger=self.logger)
    self.session_public = SessionCrawler(logger=self.logger)
    self.session_public_article = SessionCrawler(logger=self.logger)
    self.channel = channel
    self.entityId = 'SYSU'
def __init__(self, channel, logger=None):
    '''
    :param channel:
    :param logger:
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel  # the media entity to crawl
    # request headers and proxy settings
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.jianshu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    self.headers_1 = {
        'Host': 'www.jianshu.com',
        'Connection': 'keep-alive',
        'Content-Length': '0',
        'Accept': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    # user credentials
    self.usename = '13432881156'
    self.password = '******'
    # log in with a persisted cookie
    self.session = SessionCrawler()
    self.session.cookies = cookielib.LWPCookieJar(filename='cookie')  # read cookies from file
    try:
        self.session.cookies.load(ignore_discard=True)
    except:
        print('Failed to load cookie')
    if self.islogin():
        print('Already logged in to Jianshu')
    else:
        self.login(self.usename, self.password)
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.site = 'tieba.baidu.com'  # search site
    self.url = 'http://tieba.baidu.com/f/search/res'
    self.tiebaUrl_list = list()
    self.session = SessionCrawler(sleepRange=[3, 5])
    self.channel = channel
    self.count_page = 0  # time optimization: only needs to be entered once
    self.pageList = []  # stores the result page urls found by __searchByPage
    self.articleList = list()  # global list used to de-duplicate articles
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.site = 'bbs.tianya.cn'
    self.url = 'http://search.tianya.cn/bbs?&s=4&f=0'
    self.luntanUrl_list = list()
    self.session = SessionCrawler(sleepRange=[3, 5])
    self.channel = channel
    self.count_page = 0
    self.pageList = []
    self.articleList = list()  # global list used to de-duplicate articles
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel
    self.entityId = 'SYSU'
    self.user_name = None
    self.pass_word = None
    self.email = SendEmail()
    self.db = InsertDB()
    self.monitor_title = 'Weibo crawler monitor'
    self.HTTP = 'https:'
    self.session = SessionCrawler(sleepRange=[3, 8])
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.session = SessionCrawler(sleepRange=[3, 8])
    self.channel = channel
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'Upgrade-Insecure-Requests': '1'
    }
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.session = SessionCrawler(logger=self.logger)
    self.session_public = SessionCrawler(logger=self.logger)
    self.session_public_article = SessionCrawler(logger=self.logger)
    self.channel = channel
    self.entityId = 'SYSU'
    self.ip_list = None
    self.proxies = None
    self.monitor_title = 'WeChat crawler monitor'
    self.email = SendEmail()
    self.db = InsertDB()
def __init__(self, channel, logger=None):
    '''
    Constructor
    :param channel:
    :param logger:
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel
    self.nextCommentUrl = None
    self.user_name_password = self.get_username_password()
    self.user_name = self.user_name_password[0]
    self.pass_word = self.user_name_password[1]
    # build request headers
    self.agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    self.headers = {
        "HOST": "www.zhihu.com",
        "referer": "https://www.zhihu.com",
        "User-Agent": self.agent,
        "Connection": "keep-alive"
    }
    # keep the connection alive with a session
    self.session = requests.session()
    self.session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
    try:
        self.session.cookies.load(ignore_discard=True, ignore_expires=True)
    except:
        self.logger.error('Failed to load cookies')
    if self.isLogin():
        self.logger.info('Already logged in to Zhihu')
    else:
        print('Not logged in to Zhihu')
        self.login(self.user_name, self.pass_word)
    self.sleepRange = [0, 1]
    self.lastCrawlTime = 0
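# A standalone sketch of the cookie-persistence pattern used in the constructor
# above: a requests session whose cookie jar is a file-backed LWPCookieJar, so a
# login survives between runs. The file name 'cookies.txt' and the target URL are
# illustrative; on a first run there is simply no cookie file and the code continues.
import requests
import cookielib

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
try:
    # reuse cookies saved by a previous run, keeping session and expired entries
    session.cookies.load(ignore_discard=True, ignore_expires=True)
except IOError:
    pass  # no cookie file yet

resp = session.get('https://www.zhihu.com', headers={'User-Agent': 'Mozilla/5.0'})
# after a successful login the jar would be written back for the next run
session.cookies.save(ignore_discard=True, ignore_expires=True)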
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel
    # self.user_name = "*****@*****.**"
    # self.session = SessionCrawler(sleepRange=[3,8])
    # self.pass_word = "810214bee810214"
    # self.user_name = "15018377821"
    # self.pass_word = "zzm15331411"
    self.user_name_password = self.get_username_password()
    self.user_name = self.user_name_password[0]
    self.pass_word = self.user_name_password[1]
    self.logger.info('username: %s' % self.user_name)
    self.session = SessionCrawler(sleepRange=[3, 8])
def __init__(self, channel, logger=None):
    '''
    Constructor
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.s = requests.Session()
    self.channel = channel
    self.headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'libcurl-agent/1.0',
        'Host': '120.55.151.61',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip',
        # 'Content-Length': '215',
    }
    self.req_data = 'timestamp=0&preMoodTimestap=0&phoneBrand=HONOR&platform=1&phoneVersion=24&channel=huaweiMarket&phoneModel=BLN-AL10&type=1&versionNumber=9.1.1&'
    self.articleList = list()
    self.startTime = 0
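# A minimal sketch of how the headers and url-encoded req_data prepared above
# might be posted with the requests session; the endpoint path
# '/treehole/placeholder' is a made-up placeholder, not the app's real API route.
import requests

s = requests.Session()
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'libcurl-agent/1.0',
    'Host': '120.55.151.61',
    'Connection': 'Keep-Alive',
    'Accept-Encoding': 'gzip',
}
req_data = 'timestamp=0&preMoodTimestap=0&phoneBrand=HONOR&platform=1&phoneVersion=24&channel=huaweiMarket&phoneModel=BLN-AL10&type=1&versionNumber=9.1.1&'
# the payload is already url-encoded, so it is sent as the raw request body
resp = s.post('http://120.55.151.61/treehole/placeholder', headers=headers, data=req_data)
print(resp.status_code)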
def __init__(self, channel, logger=None):
    '''
    Constructor
    :param channel:
    :param logger:
    '''
    if logger is None:
        self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
    else:
        self.logger = logger
    self.channel = channel
    self.nextCommentUrl = None
    self.username = '******'
    self.password = '******'
    # build request headers
    agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
    self.headers = {
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': agent
    }
    # log in with persisted cookies
    self.session = requests.session()
    self.session.cookies = cookielib.LWPCookieJar(filename='cookies')
    try:
        self.session.cookies.load(ignore_discard=True)
    except:
        self.logger.error(u"Failed to load cookies")
    if self.isLogin():
        self.logger.info(u'Already logged in to Zhihu')
    else:
        self.login(self.username, self.password)
    self.sleepRange = [0, 1]
    self.lastCrawlTime = 0
self.title = ''
self.content = ''
self.contentUrl = ''
self.sourceUrl = ''
self.digest = ''
self.author = ''
self.fileid = ''
self.publishDateTime = 0
self.commentList = list()
self.readCount = -1
self.likeCount = -1


if __name__ == '__main__':
    import platform
    if 'window' in platform.system().lower():
        Logging.initLogger(os.path.join('conf', 'logging.win.cfg'))
    else:
        Logging.initLogger(os.path.join('conf', 'logging.cfg'))
    c = Configuration(os.path.join('conf', APP_NAME + '.cfg'))
    conf = c.readConfig()
    dbConf = conf[APP_NAME]['dbConf']
    logger = Logging.getLogger(APP_NAME)
    ownerList = conf[APP_NAME]['ownerList']
    for owner in ownerList:
        wc = WeiXinCralwer([2, 5], logger)
        articleList = wc.crawl(owner)
        wc.writeDb(dbConf, owner, articleList)
    # wc.crawl('sysuyouth')
    # wc.crawl('sysudin')