Example #1
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15088137907"
        # self.pass_word = "4p2yhynrb7"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = 'Weibo crawler monitor'
        self.proxies = ''

        self.session = SessionCrawler(sleepRange=[3, 8])
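
Shared pattern note: every constructor in these examples accepts an optional logger and falls back to the project default when none is injected. Below is a minimal, self-contained sketch of that pattern, using the standard logging module as a stand-in for the project's Logging helper (the class name and channel value are hypothetical):

import logging

class ChannelCrawler(object):
    '''Hypothetical stand-in showing the shared constructor pattern.'''

    def __init__(self, channel, logger=None):
        # Fall back to a default logger when none is injected, mirroring
        # Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) in the examples above.
        if logger is None:
            self.logger = logging.getLogger('crawler.default')
        else:
            self.logger = logger
        self.channel = channel

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = ChannelCrawler(channel='weibo')
    crawler.logger.info('constructed crawler for channel %s', crawler.channel)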
Example #2
File: baidu.py Project: magicgis/outfile
 def __init__(self, logger=None):
     '''
     Constructor
     '''
     self.session = SessionCrawler()
     if logger is None:
         self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
     else:
         self.logger = logger
Example #3
 def __init__(self, channel, dbProxy, logger=None):
     '''
     Constructor
     '''
     self.channel = channel
     self.dbProxy = dbProxy
     self.loggingPrefix = '[%s]' % self.channel.channel_name
     if logger is None:
         self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
     else:
         self.logger = logger
Example #4
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel
Example #5
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.channel = channel
Example #6
 def __init__(self, channel, dbProxy, logger=None):
     '''
     Constructor
     '''
     self.keywordList2 = list()  # used to filter out articles that contain none of the required keywords
     self.nonekyewordList = list()  # used to filter out articles that contain a blacklisted (anti-)keyword
     self.channel = channel
     self.dbProxy = dbProxy
     self.loggingPrefix = '[%s]' % self.channel.channel_name
     if logger is None:
         self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
     else:
         self.logger = logger
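
The two lists initialized above are, according to their comments, used to drop articles that match none of the required keywords or that contain a blacklisted (anti-)keyword. A minimal sketch of that filtering step, assuming articles are plain text strings (the function name and arguments are hypothetical):

def filter_articles(articles, keyword_list, anti_keyword_list):
    '''Hypothetical helper: keep articles that contain at least one required
    keyword and none of the blacklisted (anti-)keywords.'''
    kept = []
    for text in articles:
        if keyword_list and not any(k in text for k in keyword_list):
            continue  # drop: none of the required keywords present
        if any(k in text for k in anti_keyword_list):
            continue  # drop: a blacklisted keyword present
        kept.append(text)
    return kept

print(filter_articles(['campus news from SYSU', 'unrelated advert'],
                      keyword_list=['SYSU'], anti_keyword_list=['advert']))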
Example #7
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'
Example #8
File: jianshu.py Project: magicgis/outfile
    def __init__(self, channel, logger=None):
        '''

        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel  # the media/channel entity
        # set up the request headers and proxies
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.jianshu.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        self.headers_1 = {
            'Host': 'www.jianshu.com',
            'Connection': 'keep-alive',
            'Content-Length': '0',
            'Accept': 'application/json',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        # user credentials
        self.usename = '13432881156'
        self.password = '******'
        # log in using saved cookies
        self.session = SessionCrawler()
        self.session.cookies = cookielib.LWPCookieJar(
            filename='cookie')  # read cookies from file
        try:
            self.session.cookies.load(ignore_discard=True)
        except:
            print('Failed to load cookies')
        if self.islogin():
            print('Already logged in to Jianshu')
        else:
            self.login(self.usename, self.password)
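
The cookie handling above (an LWPCookieJar bound to the session, loaded at start-up, with a fresh login only when the saved cookies no longer work) is a common way to avoid logging in on every run. A minimal sketch of just that persistence step, assuming a plain requests.Session in place of the project's SessionCrawler:

import requests
try:
    import cookielib                      # Python 2, as in the example above
except ImportError:
    import http.cookiejar as cookielib    # Python 3 equivalent

session = requests.Session()
session.cookies = cookielib.LWPCookieJar(filename='cookie')
try:
    # Reuse cookies saved by a previous successful login.
    session.cookies.load(ignore_discard=True)
except IOError:
    print('Failed to load cookies; a fresh login is required')

# ... after a successful login, persist the cookies for the next run:
session.cookies.save(ignore_discard=True)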
Example #9
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.site = 'tieba.baidu.com'  # search site
        self.url = 'http://tieba.baidu.com/f/search/res'
        self.tiebaUrl_list = list()
        self.session = SessionCrawler(sleepRange=[3, 5])
        self.channel = channel

        self.count_page = 0  # time optimization: the page only needs to be entered once
        self.pageList = []  # stores the number of URL result pages found by __searchByPage
        self.articleList = list()  # global list used to de-duplicate articles
Example #10
File: tianya.py Project: magicgis/outfile
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.site = 'bbs.tianya.cn'
        self.url = 'http://search.tianya.cn/bbs?&s=4&f=0'
        self.luntanUrl_list = list()
        self.session = SessionCrawler(sleepRange=[3, 5])
        self.channel = channel

        self.count_page = 0
        self.pageList = []
        self.articleList = list()  # global list used to de-duplicate articles
Example #11
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        self.user_name = None
        self.pass_word = None

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = 'Weibo crawler monitor'
        self.HTTP = 'https:'

        self.session = SessionCrawler(sleepRange=[3, 8])
Example #12
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''

        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(sleepRange=[3, 8])
        self.channel = channel
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.douban.com',
            'Upgrade-Insecure-Requests': '1'
        }
Example #13
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

        self.ip_list = None
        self.proxies = None
        self.monitor_title = 'WeChat crawler monitor'
        self.email = SendEmail()
        self.db = InsertDB()
Example #14
File: zhihu.py Project: magicgis/outfile
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel
        self.nextCommentUrl = None
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]

        # build the request headers
        self.agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        self.headers = {
            "HOST": "www.zhihu.com",
            "referer": "https://www.zhihu.com",
            "User-Agent": self.agent,
            "Connection": "keep-alive"
        }

        # use a session to keep the connection alive
        self.session = requests.session()
        self.session.cookies = cookielib.LWPCookieJar(filename='cookies.txt')
        try:
            self.session.cookies.load(ignore_discard=True, ignore_expires=True)
        except:
            self.logger.error('Failed to load cookies')

        if self.isLogin():
            self.logger.info('Already logged in to Zhihu')
        else:
            print('Not logged in to Zhihu')
            self.login(self.user_name, self.pass_word)

        self.sleepRange = [0, 1]
        self.lastCrawlTime = 0
Example #15
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15018377821"
        # self.pass_word = "zzm15331411"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.session = SessionCrawler(sleepRange=[3, 8])
Example #16
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.s = requests.Session()
        self.channel = channel
        self.headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'libcurl-agent/1.0',
            'Host': '120.55.151.61',
            'Connection': 'Keep-Alive',
            'Accept-Encoding': 'gzip',
            # 'Content-Length': '215',
        }
        self.req_data = 'timestamp=0&preMoodTimestap=0&phoneBrand=HONOR&platform=1&phoneVersion=24&channel=huaweiMarket&phoneModel=BLN-AL10&type=1&versionNumber=9.1.1&'
        self.articleList = list()
        self.startTime = 0
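
The headers and req_data prepared above describe a form-encoded POST against the app's API host. A hypothetical usage sketch of how such a request might be sent with requests (the endpoint path and the trimmed payload below are assumptions, not taken from the source):

import requests

API_URL = 'http://120.55.151.61/hypothetical/endpoint'  # placeholder path, not from the source

session = requests.Session()
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'libcurl-agent/1.0',
    'Accept-Encoding': 'gzip',
}
req_data = 'timestamp=0&preMoodTimestap=0&type=1&versionNumber=9.1.1&'

# Send the pre-encoded form body as-is; requests passes string payloads through unchanged.
resp = session.post(API_URL, headers=headers, data=req_data, timeout=10)
print(resp.status_code)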
Example #17
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel
        self.nextCommentUrl = None
        self.username = '******'
        self.password = '******'

        # build the request headers
        agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
        self.headers = {
            'Host': 'www.zhihu.com',
            'Referer': 'https://www.zhihu.com',
            'User-Agent': agent
        }

        # log in using saved cookie information
        self.session = requests.session()
        self.session.cookies = cookielib.LWPCookieJar(filename='cookies')
        try:
            self.session.cookies.load(ignore_discard=True)
        except:
            self.logger.error(u"Cookie未能加载")

        if self.isLogin():
            self.logger.info(u'Already logged in to Zhihu')
        else:
            self.login(self.username, self.password)
        self.sleepRange = [0, 1]
        self.lastCrawlTime = 0
Example #18
File: weixin.py Project: magicgis/outfile
        self.title = ''
        self.content = ''
        self.contentUrl = ''
        self.sourceUrl = ''
        self.digest = ''
        self.author = ''
        self.fileid = ''
        self.publishDateTime = 0
        self.commentList = list()
        self.readCount = -1
        self.likeCount = -1        

if __name__ == '__main__':
    import platform
    if 'window' in platform.system().lower():
        Logging.initLogger(os.path.join('conf','logging.win.cfg'))
    else:
        Logging.initLogger(os.path.join('conf','logging.cfg'))
    c = Configuration(os.path.join('conf',APP_NAME+'.cfg'))
    conf = c.readConfig()
    dbConf = conf[APP_NAME]['dbConf']

    logger = Logging.getLogger(APP_NAME)
    
    ownerList = conf[APP_NAME]['ownerList']
    for owner in ownerList:
        wc = WeiXinCralwer([2,5],logger)
        articleList = wc.crawl(owner)
        wc.writeDb(dbConf, owner, articleList)
    #wc.crawl('sysuyouth')
    #wc.crawl('sysudin')