Esempi in Python per Doraemon.isEmpty

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: middlewares.doraemonMiddleware

Classe/tipologia: Doraemon

Metodo/funzione: isEmpty

Esempi su hotexamples.com: 4

Doraemon.isEmpty in Python: 4 esempi trovati. Questi sono i migliori esempi reali in Python per middlewares.doraemonMiddleware.Doraemon.isEmpty, estratti da progetti open source. Puoi valutarli per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

createFilePath(8)

Doraemon(5)

isEmpty(4)

isExceedRestartInterval(4)

storeFinished(3)

readNewImageIds(2)

storeMongodb(2)

getAllHasSet(2)

getDateFromString(2)

hashSet(2)

storeTxt(1)

storeHtml(1)

readNewUrls(1)

isDuplicated(1)

isFileExists(1)

compressImage(1)

getFileSize(1)

downloadImage(1)

delKey(1)

delHashSet(1)

tar(1)

Esempio n. 1

Mostra file

File: sogo_account.py Progetto: hulu7/news

class SogoAccount():
    """Look up Sogou WeChat account links for a list of search keys.

    Keys are read from the text file configured as ``URLS``; each key that is
    not yet recorded in the ``finished_sogo_account`` hash set is searched on
    weixin.sogou.com through ``BrowserRequest.start_chrome`` using a proxy
    taken from the ``valid_proxy_pool_sogo_account`` hash set.
    """

    # "<a>.<b>.<c>.<d>:<port>" made of decimal digits. The dots are escaped so
    # that only a literal "." separates the groups; an unescaped "." would
    # accept any character (e.g. "1a2b3c4:80"). Used with ``match``, so only
    # the start of the string is anchored.
    _PROXY_REGEX = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+")

    def __init__(self):
        """Load settings, build the middlewares and create the work/log paths."""
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.requests = RequestsMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'sogo' settings and shared Settings values onto the instance."""
        settings_name = self.settings.CreateSettings('sogo')
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = self.settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.valid_proxy_pool_sogo_account = self.settings.VALID_PROXY_POOL_SOGO_ACCOUNT
        self.invalid_proxy_pool_sogo_account = self.settings.INVALID_PROXY_POOL_SOGO_ACCOUNT
        self.finished_sogo_account = self.settings.FINISHED_SOGO_ACCOUNT
        self.regx = self._PROXY_REGEX

    def getProxy(self):
        """Fetch proxies from the provider and add the valid ones to the pool.

        Each non-empty response line that starts with an ``ip:port`` pattern is
        written to the ``valid_proxy_pool_sogo_account`` hash set; every other
        line is reported as a failure. Errors raised while storing a proxy are
        logged and do not stop the remaining lines from being processed.
        """
        # NOTE(review): the provider URL embeds account credentials in source;
        # consider moving it to settings.
        url = "http://ip.16yun.cn:817/myip/pl/c167cc62-6ad5-4876-bfd8-0cc423dab398/?s=wygafjcqjv&u=hellobee&count=2"
        # url = "http://129.28.124.247:43059/get_ip.php?key=908299fbaefcacef4eb2c9e6ea18c5f2"
        response = self.requests.requests_request(url, headers=None, host="ip.16yun.cn", referer="ip.16yun.cn")
        for line in response.text.strip().split('\n'):
            ip = line.strip()
            isValidIp = self.regx.match(ip)
            if self.doraemon.isEmpty(ip) is False and isValidIp is not None:
                available = "Proxy: {0} is available.".format(ip)
                self.file.logger(self.log_path, available)
                print(available)
                try:
                    self.doraemon.hashSet(self.valid_proxy_pool_sogo_account, ip, ip)
                except Exception as e:
                    # Format the exception itself: ``e.message`` is deprecated
                    # and is empty for exceptions built from several arguments.
                    failure = "Exception to set redis for available sogo account of ip: {0}: {1}.".format(ip, e)
                    print(failure)
                    self.file.logger(self.log_path, failure)
            else:
                self.file.logger(self.log_path, 'Fail to get proxy for sogo account.')
                print("Fail to get proxy for sogo account.")

    def parse(self, response):
        """Callback for ``start_chrome``: store the account link of one result page.

        ``response`` is indexed with ``'response'`` (an object exposing
        ``current_url`` and ``page_source``) and ``'request_title'`` (the
        search key). When the page has no account link the current proxy is
        moved to the invalid pool and another one is selected; otherwise the
        resolved link is stored and the next pending URL is queued.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        key = response['request_title']
        href_item = html.xpath(".//*[contains(@uigs, 'account_name_0')]/@href")
        if len(href_item) == 0:
            print('Blocked and change for another proxy.')
            self.doraemon.hashSet(self.invalid_proxy_pool_sogo_account, self.proxy, self.proxy)
            self.doraemon.delHashSet(self.valid_proxy_pool_sogo_account, self.proxy)
            all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
            if len(all_valid_proxy) == 0:
                print('The proxy pool is empty and get proxy again.')
                self.file.logger(self.log_path, 'The proxy pool is empty and get proxy again.')
                self.getProxy()
            all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))
            # NOTE(review): raises IndexError when the refill above did not
            # yield any proxy; the current URL is left queued for a retry.
            self.proxy = all_valid_proxy.pop()
            return
        href = href_item[0]
        url = urlparse.urljoin(current_url, href)
        self.doraemon.hashSet(self.name, key, url)
        self.doraemon.hashSet(self.finished_sogo_account, key, key)
        print('Finished for {0}'.format(key))
        self.current_url.pop()
        if len(self.new_urls) > 0:
            new_url = self.new_urls.pop()
            print('Start next: {0}'.format(new_url[0]))
            self.current_url.append(new_url)
        print('End to parse {0}, url: {1}'.format(key, href_item[0]))

    def start_requests(self):
        """Run one crawl over every key that has not been finished yet.

        Returns early when ``isExceedRestartInterval`` reports False or when no
        proxy is available after refreshing the pool.
        """
        if self.doraemon.isExceedRestartInterval(self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print('Start {0} requests'.format(self.name))

        try:
            self.getProxy()
        except Exception as e:
            # str(e) instead of the deprecated, often empty ``e.message``.
            self.file.logger(self.settings.LOG_PATH, 'Exception to get proxy: {0}'.format(str(e)))

        # Only used for membership tests below, so a set is sufficient.
        all_finished_id = set(self.doraemon.getAllHasSet(self.finished_sogo_account))
        all_valid_proxy = list(self.doraemon.getAllHasSet(self.valid_proxy_pool_sogo_account))

        if self.doraemon.isEmpty(all_valid_proxy):
            self.file.logger(self.log_path, 'No available proxy for sogo account and return.')
            print("No available proxy for sogo account and return.")
            return

        self.new_urls = []
        self.current_url = []

        content = self.file.readFromTxt(self.urls)
        keys = [key for key in content.split('\n') if self.doraemon.isEmpty(key) is False]

        self.proxy = all_valid_proxy.pop()
        for key in keys:
            if key not in all_finished_id:
                tmp_url = "https://weixin.sogou.com/weixin?type=1&s_from=input&query={0}&ie=utf8&_sug_=n&_sug_type_=".format(key)
                self.new_urls.append([tmp_url, key])

        request = BrowserRequest()
        if len(self.new_urls) > 0:
            self.current_url.append(self.new_urls.pop())
        else:
            print('No more urls.')
        # ``parse`` pops a finished entry from ``self.current_url`` and queues
        # the next one, so the loop ends once nothing is left to request.
        while len(self.current_url) > 0:
            print("Proxy :{0}".format(self.proxy))
            if len(self.new_urls) > 0:
                self.current_url.append(self.new_urls.pop())
            else:
                print('No more urls.')
            request.start_chrome(self.current_url, self.max_pool_size, self.log_path, self.proxy, callback=self.parse)

        self.file.logger(self.log_path, 'End for requests of {0}.'.format(self.name))

Esempio n. 2

Mostra file

class Huxiu():
    """Crawl Huxiu article pages and hand the extracted fields to Doraemon.

    Article URLs come from ``Doraemon.readNewUrls``; every parsed page is
    passed to ``storeMongodb`` / ``storeTxt`` when it has both a title and
    body text, and is always recorded as finished afterwards.
    """

    def __init__(self):
        """Load settings, build the middlewares and create the work/log paths."""
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)

    def getSettings(self):
        """Copy the 'huxiu' settings plus a few fixed paths onto the instance."""
        settings_name = self.settings.CreateSettings('huxiu')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_txt_path = '/home/dev/Data/rsyncData/huxiu_nlp/text/'
        self.url_path = '/home/dev/Data/rsyncData/huxiu_nlp/huxiu_nlp.csv'
        self.mongo = 'huxiu_nlp'
        self.name = settings_name['NAME']
        self.max_pool_size = 4
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.is_open_cache = settings_name['IS_OPEN_CACHE']

    def parse(self, response):
        """Callback for ``start_chrome``: extract and store one article page.

        ``response`` is indexed with ``'response'`` (an object exposing
        ``current_url`` and ``page_source``) and ``'request_title'``. Fields
        that are missing from the page are stored as empty strings.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)

        title = ""
        comment_number = ""
        share_number = ""
        image_url = ""
        content = ""
        public_time = ""
        author_url = ""
        author_name = ""
        valid = False

        # ``current_url`` is already a GBK byte string. Encoding it a second
        # time would trigger an implicit ASCII decode on Python 2 and fail for
        # any non-ASCII byte, so the digits are filtered from it directly.
        article_id = str(filter(str.isdigit, current_url))
        title1 = html.xpath(".//*[contains(@class,'t-h1')]/text()")
        comment_number1 = html.xpath(
            ".//*[contains(@class, 'article-pl pull-left')]/text()")
        share_number1 = html.xpath(
            ".//*[contains(@class, 'article-share pull-left')]/text()")
        image_url1 = html.xpath(
            ".//*[contains(@class, 'article-img-box')]/img/@src")
        content1 = html.xpath(
            ".//div[contains(@class, 'article-content-wrap')]//text()")
        time1 = html.xpath(".//*[contains(@class, 'article-time')]/text()")
        author_url1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/@href")
        author_name1 = html.xpath(
            ".//*[contains(@class, 'author-name')]/a/text()")

        if self.doraemon.isEmpty(title1) is False:
            title = title1[0].strip()
        if self.doraemon.isEmpty(comment_number1) is False:
            comment_number = str(
                filter(str.isdigit, comment_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(share_number1) is False:
            share_number = str(
                filter(str.isdigit, share_number1[0].encode('gbk'))).strip()
        if self.doraemon.isEmpty(image_url1) is False:
            image_url = image_url1[0].strip()
        if self.doraemon.isEmpty(content1) is False:
            content = ''.join(content1).strip()
            # A page only counts as valid when it has body text.
            valid = True
        if self.doraemon.isEmpty(time1) is False:
            public_time = ''.join(time1).strip()
            public_time = self.doraemon.getDateFromString(public_time)
        if self.doraemon.isEmpty(author_url1) is False:
            author_url = urlparse.urljoin(current_url, author_url1[0].strip())
        if self.doraemon.isEmpty(author_name1) is False:
            author_name = ''.join(author_name1[0]).strip()

        data = {
            'title': title,
            'comment_number': comment_number,
            'share_number': share_number,
            'image_url': image_url,
            'url': current_url,
            'public_time': public_time,
            'author_url': author_url,
            'author_name': author_name,
            'id': article_id,
            'download_time': self.today,
            'is_open_cache': self.is_open_cache,
            'source': self.source
        }
        print('End to parse: {0}'.format(current_url))
        if valid and self.doraemon.isEmpty(title) is False:
            self.file.logger(self.log_path,
                             'Start to store mongo {0}'.format(data['url']))
            print('Start to store mongo {0}'.format(data['url']))
            self.doraemon.storeMongodb(self.mongo, data)
            self.file.logger(self.log_path,
                             'End to store mongo {0}'.format(data['url']))
            print('End to store mongo {0}'.format(data['url']))
            self.doraemon.storeTxt(article_id, content, self.finished_txt_path,
                                   self.name)
        # The request is recorded as finished whether or not it was stored.
        self.doraemon.storeFinished(self.doraemon.bf_huxiu_nlp,
                                    response['request_title'])
        # Drop the page-sized locals before forcing a collection.
        del current_url, html, title, comment_number, share_number, image_url, content, public_time, author_url, author_name, article_id, data
        gc.collect()

    def start_requests(self):
        """Request every new article URL and parse each page with ``parse``."""
        self.file.logger(self.log_path, 'Start request: {0}'.format(self.name))
        print('Start ' + self.name + ' requests')
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_huxiu_nlp,
                                                   self.url_path)
        # new_url_titles = [['https://www.huxiu.com/article/36.html', '【WHAT】十年内10大互联网IPO']]
        if len(new_url_titles) == 0:
            self.file.logger(self.log_path,
                             'No new url for: {0}'.format(self.name))
            print('No new url for: {0}'.format(self.name))
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.file.logger(self.log_path,
                         'End requests: {0}'.format(str(len(content))))
        print('End requests: {0}'.format(str(len(content))))
        del new_url_titles, request, content
        gc.collect()

Esempio n. 3

Mostra file

File: top_baidu_url.py Progetto: hulu7/news

class Topbaidu():
    """Collect article links from the pages listed in the configured URLS file.

    Every link whose href contains at least one digit, has a non-empty title
    and is not reported as duplicated by ``Doraemon.isDuplicated`` is passed
    to ``Doraemon.storeMongodb``.
    """

    def __init__(self):
        """Load settings, build the middlewares and create the work/log paths."""

        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(Settings.LOG_PATH)

    def getSettings(self):
        """Copy crawler settings onto the instance.

        NOTE(review): ``settings_name`` and ``Settings`` are not defined in
        this class; presumably they are module-level names - confirm.
        """
        self.work_path_prd2 = settings_name['WORK_PATH_PRD2']
        self.mongo = settings_name['MONGO_URLS']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.log_path = Settings.LOG_PATH_PRD2
        self.urls = settings_name['URLS']
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.today = Settings.TODAY

    def parse(self, response):
        """Callback for ``start_chrome``: store every acceptable link of a page.

        ``response`` is indexed with ``'response'``, an object exposing
        ``current_url`` and ``page_source``.
        """
        current_url = response['response'].current_url.encode('gbk')
        print('Start to parse: {0}'.format(current_url))
        html = etree.HTML(response['response'].page_source)
        href_items = html.xpath(".//*[contains(@class, 'article-item-title')]")
        for item in href_items:
            href = item.xpath("@href")
            if len(href) == 0:
                continue
            href_url = href[0]
            hasId = str(filter(str.isdigit, href_url))
            if len(hasId) == 0:
                print('Invalid url for no id: {0}'.format(href_url))
                continue
            # A link is accepted unless it contains one of the bad keys.
            # NOTE(review): the original whitelist loop over ``self.goodkeys``
            # could never change the outcome because ``valid`` started as
            # True; confirm whether a whitelist was intended.
            valid = not any(bad in href_url for bad in self.badkeys)
            if valid:
                # The id is the last piece after splitting on '.', ',', ' ',
                # '/' and '_'.
                short_url_parts = re.split(r'[., /, _]', href_url)
                id = short_url_parts[-1]
                url = urlparse.urljoin(current_url, href_url)
                title = ""
                title_list1 = item.xpath(".//text()")
                if len(title_list1) > 0:
                    title = title_list1[0]
                    print(title)
                is_title_empty = self.doraemon.isEmpty(title)
                if (is_title_empty is False) and (
                        self.doraemon.isDuplicated(title) is False):
                    data = {
                        'title': title.strip(),
                        'url': url.strip(),
                        'id': id.strip(),
                        'download_time': self.today
                    }
                    self.file.logger(
                        self.log_path,
                        'Start to store mongo {0}'.format(data['url']))
                    print('Start to store mongo {0}'.format(data['url']))
                    self.doraemon.storeMongodb(self.mongo, data)
                    self.file.logger(
                        self.log_path,
                        'End to store mongo {0}'.format(data['url']))
                    print('End to store mongo {0}'.format(data['url']))
                    self.file.logger(self.log_path, 'Done for {0}'.format(url))
                else:
                    if is_title_empty is True:
                        self.file.logger(self.log_path,
                                         'Empty title for {0}'.format(url))
                        print('Empty title for {0}'.format(url))
                    print('Finished or Empty title for {0}'.format(url))
            else:
                self.file.logger(self.log_path, 'Invalid {0}'.format(href_url))
                print('Invalid {0}'.format(href_url))
        # Report the page URL: the per-item ``href_url`` is unbound when the
        # page yields no link with an href, which used to raise here.
        print('End to parse {0}'.format(current_url))

    def start_requests(self):
        """Request every URL from the URLS file and parse each page.

        Returns early when ``isExceedRestartInterval`` reports False or when
        the file contains no non-empty line.
        """
        if self.doraemon.isExceedRestartInterval(
                self.restart_path, self.restart_interval) is False:
            return
        self.file.logger(self.log_path, 'Start {0} requests'.format(self.name))
        print('Start {0} requests'.format(self.name))
        self.badkeys = []
        self.goodkeys = []

        content = self.file.readFromTxt(self.urls)
        new_urls = [[url, ''] for url in content.split('\n')
                    if self.doraemon.isEmpty(url) is False]

        if len(new_urls) == 0:
            print('No url.')
            return

        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        summary = 'End for {0} requests of {1}.'.format(str(len(content)),
                                                        self.name)
        self.file.logger(self.log_path, summary)
        print(summary)

Esempio n. 4

Mostra file

class WeixinSalticidae():
    """Download the images of stored WeChat article pages and rewrite the HTML.

    For every new id returned by ``Doraemon.readNewImageIds`` the original
    HTML file is read, its publish time is normalised, each usable ``<img>``
    is downloaded and pointed at ``url_deepinews_10002_image``, and the
    resulting HTML is written with ``Doraemon.storeHtml``.
    """

    def __init__(self):
        """Load settings, build the middlewares and create the output paths."""
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.settings.LOG_PATH)
        self.doraemon.createFilePath(self.finished_img_path)

    def getSettings(self):
        """Copy the 'weixin' settings onto the instance and compile the regexes."""
        settings_name = self.settings.CreateSettings('weixin')
        self.source = settings_name['SOURCE_NAME']
        self.work_path_prd1 = settings_name['WORK_PATH_PRD1']
        self.finished_img_path = settings_name['FINISHED_IMG_PATH']
        self.finished_origin_html_path = settings_name[
            'FINISHED_ORIGIN_HTML_PATH']
        self.finished_processed_html_path = settings_name[
            'FINISHED_PROCESSED_HTML_PATH']
        self.finished_content_path = settings_name['FINISHED_CONTENT_PATH']
        self.mongo = settings_name['MONGO']
        self.name = settings_name['NAME']
        self.max_pool_size = settings_name['MAX_POOL_SIZE']
        self.url_deepinews_10002_article = self.settings.URL_DEEPINEWS_10002_ARTICLE
        self.url_deepinews_10002_image = self.settings.URL_DEEPINEWS_10002_IMAGE
        self.log_path = self.settings.LOG_PATH
        self.today = self.settings.TODAY
        self.restart_path = settings_name['RESTART_PATH']
        self.restart_interval = settings_name['RESTART_INTERVAL']
        self.regx_img = re.compile('<img(.*?)/>')
        self.regx_date = re.compile(
            '<em id="publish_time" class="rich_media_meta rich_media_meta_text">(.*?)</em>'
        )
        self.regx_img_type = re.compile('data-type="(.*?)"')
        self.regx_img_data_src = re.compile('data-src="(.*?)"')
        self.regx_img_src = re.compile('src="(.*?)"')
        self.regx_img_class = re.compile('class="(.*?)"')

    def getPostFixOfImage(self, image_type):
        """Return the file extension for ``image_type``.

        'jpeg' maps to 'jpg'; 'png' and 'gif' map to themselves. Any other
        value is printed and yields None.
        """
        if image_type == 'jpeg':
            return 'jpg'
        if image_type in ('png', 'gif'):
            return image_type
        print('Other type: {0}'.format(image_type))
        return None

    def start_requests(self):
        """Process every new article id: fix dates, download and relink images."""
        self.file.logger(self.log_path,
                         'Start dowload images for: {0} '.format(self.name))
        print('Start dowload images for: {0} '.format(self.name))
        new_ids = self.doraemon.readNewImageIds(
            self.doraemon.bf_finished_image_id, self.finished_content_path)
        if len(new_ids) == 0:
            self.file.logger(self.log_path,
                             'No new image id for {0}'.format(self.name))
            print('No new image id for {0}'.format(self.name))
            return
        self.doraemon.createFilePath(self.finished_processed_html_path)
        self.doraemon.createFilePath(self.finished_img_path)
        time_template = '<em id="publish_time" class="rich_media_meta rich_media_meta_text">{0}</em>'
        for page_id in new_ids:
            print('Start to remove pictures in: {0}'.format(page_id))
            # ``html_file`` always holds the latest version of the page, so a
            # page without dates or usable images is stored unchanged instead
            # of being replaced by an empty document.
            html_file = self.file.readFromHtml("{0}/{1}.html".format(
                self.finished_origin_html_path, page_id))
            img_list = self.regx_img.findall(html_file)
            date_list = self.regx_date.findall(html_file)
            number = 0
            for old_time in date_list:
                new_date = self.doraemon.getDateFromString(old_time)
                html_file = html_file.replace(time_template.format(old_time),
                                              time_template.format(new_date))
            for img in img_list:
                old_img = img
                image_id = "{0}_{1}".format(page_id, number)
                image_data_src = ''.join(
                    self.regx_img_data_src.findall(img)).strip()
                src_list = self.regx_img_src.findall(img)
                image_type = ''.join(self.regx_img_type.findall(img)).strip()
                # NOTE(review): returns None for types other than jpeg/png/gif,
                # which ends up as a ".None" extension below - confirm intent.
                image_post_fix = self.getPostFixOfImage(image_type)
                if (self.doraemon.isEmpty(image_data_src) is True) or \
                   (self.doraemon.isEmpty(src_list) is True) or \
                   (self.doraemon.isEmpty(image_type) is True):
                    continue
                origin_image_path = "{0}/{1}.{2}".format(
                    self.finished_img_path, image_id, image_post_fix)
                print('Start to download image: {0}'.format(image_data_src))
                self.doraemon.downloadImage(image_data_src, origin_image_path)
                image_size = self.doraemon.getFileSize(origin_image_path)
                # Unit of the threshold is whatever getFileSize returns.
                if image_size > 60:
                    print('Start to compress image: {0}'.format(image_data_src))
                    self.doraemon.compressImage(origin_image_path,
                                                origin_image_path, 2)
                    print('Finished to compress image: {0}'.format(
                        image_data_src))
                print('Finished to download image: {0}'.format(image_data_src))
                print('Start to replace image url: {0}'.format(image_id))
                new_imgurl = "{0}{1}.{2}".format(
                    self.url_deepinews_10002_image, image_id, image_post_fix)
                # new_imgurl = '/home/dev/Data/rsyncData/prd4/weixin/img/{0}.{1}'.format(image_id, image_post_fix)
                img_class_list = self.regx_img_class.findall(img)
                for img_class in img_class_list:
                    # Replacing an empty string would insert the new text
                    # between every character of the tag.
                    if img_class:
                        img = img.replace(img_class, 'rich_pages')
                for src in src_list:
                    if src:
                        img = img.replace(src, new_imgurl)
                html_file = html_file.replace(old_img, img)
                print('Finished to replace image url: {0}'.format(image_id))
                number += 1
            self.doraemon.storeHtml(page_id, html_file,
                                    self.finished_processed_html_path)
            self.doraemon.storeFinished(self.doraemon.bf_finished_image_id,
                                        page_id)