Example #1
class UploadMongoData():
    def __init__(self):
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.log_path = self.settings.LOG_PATH
        self.doraemon.createFilePath(self.log_path)

    def startUpload(self):
        fromFile = self.settings.LOCAL_MONGO_DATA_PATH
        toFile = self.settings.REMOTE_MONGO_DATA_PATH
        if not os.path.exists(fromFile):
            print 'no mongo data file to upload'
            return
        # Retry until the dump file has been uploaded and deleted locally.
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.settings.IP_WEBSERVER0,
                        self.settings.PORT_WEBSERVER0,
                        self.settings.USER_ROOT_WEBSERVER0,
                        self.settings.USER_ROOT_PASSWORD_WEBSERVER0, fromFile,
                        toFile):
                    self.doraemon.deleteFile(fromFile)
                    message1 = 'Succeeded in uploading mongo data file: {0}'.format(
                        fromFile)
                    print message1
                    self.file.logger(self.log_path, message1)
            except Exception as e:
                message2 = 'Exception {0} while uploading mongo data file: {1}'.format(
                    e.message, fromFile)
                print message2
                self.file.logger(self.log_path, message2)
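
UploadMongoData is a one-shot uploader: startUpload keeps retrying over SSH until the local mongo dump disappears, deleting it only after a successful transfer. A minimal usage sketch, assuming the class and its Settings/Doraemon/FileIOMiddleware dependencies are importable from the surrounding project (the import path is an assumption):

# Usage sketch; the module path below is hypothetical and depends on the project layout.
from upload_mongo_data import UploadMongoData  # hypothetical import path

uploader = UploadMongoData()
# Blocks until the local mongo dump has been uploaded over SSH and deleted locally.
uploader.startUpload()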
Example #2
class FengReceptorContent():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor_content"
        self.finished_ids = "feng_receptor_content"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contens = html.xpath(".//*[contains(@class, 'newLine-4rktaWav')]")
        if len(href_contens) == 0:
            print 'No data for: {0}'.format(key)
            return
        texts = href_contens[0].xpath(
            ".//*[contains(@class, 'time-RyJJYUOX')]/text()")
        time_source = ''.join(texts).strip()
        # Record this URL as finished so later runs skip it.
        self.doraemon.hashSet(self.finished_ids, current_url, current_url)
        data = {'id': key, 'url': current_url, 'date': time_source}
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        file_path = '/home/dev/Data/rsyncData/test/feng_receptor.csv'
        items = self.file.readFromCSV(file_path)
        items.pop(0)  # drop the CSV header row

        for item in items:
            key = item[0]
            if key not in all_finished_id:
                name = key.strip()
                url = item[1]
                new_urls.append([url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             2,
                             self.log_path,
                             None,
                             callback=self.parse)
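
This receptor, like Examples #3 to #5 below, is driven the same way: instantiate it and call start_requests, which reads the seed list, drops IDs already recorded in the finished-ids set, and hands the remaining URLs to BrowserRequest, which in turn calls parse for each loaded page. A minimal driver sketch under that assumption (the import path is hypothetical):

# Driver sketch; the module path below is hypothetical.
from feng_receptor_content import FengReceptorContent  # hypothetical import path

receptor = FengReceptorContent()
# Reads the seed CSV, skips entries already in the finished-ids set, and lets
# BrowserRequest call receptor.parse for every page it manages to load.
receptor.start_requests()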
Example #3
class ChuansongmeReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/"
        self.mongo = "gongzhonghao_test"
        self.finished_ids = "gongzhonghao_test"
        self.log_path = "/home/dev/Data/rsyncData/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_item = html.xpath("./*[contains(@class, 'pagedlist_item')]")
        if len(href_item) == 0:
            print 'No data for: {0}'.format(key)
            return
        self.doraemon.hashSet(self.finished_ids, key, key)
        data = {
            'id': key,
            'url': current_url
        }
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            if key not in all_finished_id:
                tmp_url = "https://chuansongme.com/account/{0}".format(key)
                new_urls.append([tmp_url, key])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
Example #4
class XueqiuReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "xueqiu_test"
        self.finished_ids = "xueqiu_test"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contens = html.xpath(
            ".//*[contains(@class, 'search__user__card__content')]")
        if len(href_contens) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contens:
            href = item.xpath(".//*[contains(@class, 'user-name')]/@href")
            title_content = item.xpath(
                ".//*[contains(@class, 'user-name')]//span/text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                url = "https://xueqiu.com/u{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://xueqiu.com/k?q={0}".format(name)
                new_urls.append([tmp_url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             5,
                             self.log_path,
                             None,
                             callback=self.parse)
Example #5
class WoshipmReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "whoispm_receptor"
        self.finished_ids = "woshipm_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"
        # Author profile links are expected to look like /u/<digits>.
        self.regx = re.compile("/u/[0-9]{0,}")

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contens = html.xpath("./a")
        if len(href_contens) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contens:
            href = item.xpath("@href")
            title_content = item.xpath(".//text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                isValidUrl = self.regx.match(href[0])
                if isValidUrl is None:
                    print 'Invalid url for not match: {0}'.format(href[0])
                    continue
                url = "http://www.woshipm.com{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
Example #6
class FengReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor"
        self.finished_ids = "feng_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        key = response['request_title'].strip()
        # The search endpoint returns JSONP (getData({...})): keep everything after
        # the opening parenthesis, drop the trailing wrapper characters, and map
        # JSON null to None so the payload can be evaluated as a Python dict.
        raw = response['response'].page_source.encode('utf-8')
        payload = raw[raw.find('(') + 1:-21]
        payload = payload.replace('null', 'None')
        dics = eval(payload)
        if len(dics['items']) == 0:
            print 'No data for: {0}'.format(key)
            self.doraemon.hashSet(self.finished_ids, key, key)
            return
        for item in dics['items']:
            # Strip the <em> highlight tags and escaped slashes injected by the search API.
            name = item['name'].replace('<', '').replace('em>', '').replace('\\/', '')
            id = item['id']
            if len(id) > 0 and name == key:
                url = "https://feng.ifeng.com/author/{0}".format(id)
                self.doraemon.hashSet(self.finished_ids, key, key)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
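
The parse method above unwraps a JSONP response (getData({...})) by slicing the raw string and passing it to eval. Below is a sketch of the same unwrapping done with a regular expression and the json module, which avoids evaluating remote content as Python code; only the getData wrapper and the items payload are taken from the example, the helper itself is an assumption:

import json
import re

def extract_jsonp_payload(page_source):
    # Strip a JSONP wrapper such as getData({...}); and parse the body as JSON,
    # which handles null/true/false without any string substitution.
    match = re.search(r'\(\s*(\{.*\})\s*\)', page_source, re.S)
    if match is None:
        return None
    return json.loads(match.group(1))

# dics = extract_jsonp_payload(response['response'].page_source)
# items = dics['items'] if dics else []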
Example #7
class CamelBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            # Only store items that have a title and have not already been seen
            # in the URL bloom filter.
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(
                    self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path,
                                 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)

        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(
                self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6

        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.url_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file,
                                          self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(
            str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7

        del new_urls, content, url_list, request
        gc.collect()
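
CamelBone delegates extraction to the callback passed at construction: parse calls it with the current URL and the parsed lxml tree and expects a list of items exposing at least title and url (each item is then fed to createCamelMongoJson). A sketch of such a callback; the XPath expressions and the Item container are illustrative assumptions, not part of the project:

from collections import namedtuple

# Minimal item shape the parse method above reads: it accesses item.title and item.url.
Item = namedtuple('Item', ['title', 'url'])

def list_page_callback(current_url, html):
    # Hypothetical XPath; the real selectors depend on the target site.
    results = []
    for a in html.xpath("//a[@href]"):
        title = ''.join(a.xpath(".//text()")).strip()
        href = a.xpath("@href")[0]
        results.append(Item(title=title, url=href))
    return results

# bone = CamelBone(siteinfo=site_settings, callback=list_page_callback)  # site_settings is hypothetical
# bone.start()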
Example #8
class SpiderBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results is None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            message1 = 'Exception when parsing {0}: {1}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
            # Treat a failed parse as "no data" so the undefined dataToMongo is never referenced below.
            results = None
        print 'End to parse: {0}'.format(current_url)
        if results is None:
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)

    def start(self):
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        message6 = 'Start requests: {0} '.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()
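
SpiderBone works the same way but for content pages: its callback receives the current URL, the lxml tree and the raw page source, and returns either None or a single result whose id, url and content fields are read by parse. A sketch of such a callback under those assumptions; the selector and the Article container are illustrative:

from collections import namedtuple

# Fields the parse method above actually reads: results.id, results.url, results.content.
Article = namedtuple('Article', ['id', 'url', 'content'])

def content_page_callback(current_url, html, page_source):
    # Hypothetical selector; real pages need site-specific XPath.
    paragraphs = html.xpath("//div[contains(@class, 'article')]//p/text()")
    content = '\n'.join(p.strip() for p in paragraphs if p.strip())
    if not content:
        return None
    return Article(id=current_url, url=current_url, content=content)

# spider = SpiderBone(siteinfo=site_settings, callback=content_page_callback)  # site_settings is hypothetical
# spider.start()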