# Code example #1
class Export(threading.Thread):
    """Worker thread that exports MongoDB task data to an Excel workbook.

    For every document containing a ``testCases`` list, each step whose
    ``action`` is ``dataAcquisition`` and whose ``extensionFile`` is
    non-empty is recorded: the referenced file is downloaded into
    ``self.csv_dir`` and one row is appended to the workbook ``self.file``.
    """

    def __init__(self, output_obj):
        """
        :param output_obj: optional sink exposing ``output(str)`` used for
            progress messages; may be ``None`` (messages are then log-only).
        """
        threading.Thread.__init__(self)
        self.output_obj = output_obj
        self.file = None      # target .xlsx path, assigned in run()
        self.csv_dir = None   # download directory, assigned in run()
        # Initialize the busy flag up front; the original left it unset
        # until run(), so reading it earlier raised AttributeError.
        self.execting = False

    def __init_mongo__(self):
        """Open the MongoDB connection or raise ``MongoDisconnectError``."""
        self.__output__('连接MongoDB...')
        self.mongoDb = Mongo(host=mongoConfig.get_host(),
                             port=mongoConfig.get_port())
        # NOTE(review): 'connect' is read as an attribute, not called —
        # confirm the Mongo wrapper exposes a connection flag, not a method.
        if not self.mongoDb.connect:
            raise MongoDisconnectError()
        self.__output__('MongoDB连接成功!')

    def run(self):
        """Thread entry point: prepare output paths, then export all data."""
        try:
            self.execting = True
            self.__init_mongo__()
            file_name = str(time.time())
            self.file = path.join(directory.export_dir, '%s.xlsx' % file_name)
            self.csv_dir = path.join(directory.export_dir, file_name)
            self.__write_excel_head__()
            logger.debug(TAG, '开始工作')
            if is_null(self.file):
                # Nothing can be exported without a target file; abort
                # instead of falling through (the original kept going).
                self.__output__('没有选择文件...')
                return
            self.export_all()
        except MongoDisconnectError:
            logger.debug(TAG, '数据库无法连接')
            self.__output__('数据库无法连接!!!')
        except DatabaseNotFoundError:
            logger.debug(TAG, '无法找到数据库')
            self.__output__('无法找到数据库!!!')
        finally:
            self.execting = False

    def export_all(self):
        """Walk every collection of the configured database and export it."""
        self.__output__('导出所有数据...')
        db_name = mongoConfig.get_name()
        if db_name is None:
            self.__output__('请配置数据库!!!')
            return
        collections = self.mongoDb.get_collections(db_name)
        self.__output__('共查找到%d个集合' % len(collections))
        for collection_name in collections:
            self.__output__('开始处理集合:%s' % collection_name)
            collection = self.mongoDb.get_collection(db_name, collection_name)
            for data in collection.find():
                self.handle_data(data)
        self.__output__('处理完成:excel:%s,csv:%s' % (self.file, self.csv_dir))

    def handle_data(self, data):
        """Extract exportable steps from one document and record them.

        :param data: a MongoDB document; ignored unless it holds a
            ``testCases`` list of ``{'stepInfos': [...]}`` entries.
        """
        if data is None:
            self.__output__("数据为None")
            return
        if 'testCases' not in data:
            return
        self.__output__("存在testCases")
        task_id = data['taskid']
        sub_task_id = data['subtaskid']
        sub_sub_task_id = data['subsubtaskid']
        logger.debug(TAG, '数据:%s' % data)
        for test_case in data['testCases']:
            step_infos = test_case['stepInfos']
            self.__output__("处理步骤信息,共需处理%d个步骤" % len(step_infos))
            count = 0
            for step_info in step_infos:
                # Only data-acquisition steps with a usable file reference
                # are exported; everything else is skipped.
                if step_info.get('action') != 'dataAcquisition':
                    continue
                if 'extensionFile' not in step_info or is_null(
                        step_info['extensionFile']):
                    continue
                record = Record()
                record.task_id = task_id
                record.sub_task_id = sub_task_id
                record.sub_sub_task_id = sub_sub_task_id
                record.step_id = step_info['stepId']
                record.extension_file_url = step_info['extensionFile']
                self.__handle_record(record)
                count += 1
            self.__output__("步骤信息处理完成, 找到%d条可用数据" % count)

    def __handle_record(self, record):
        """Download the record's file, then append the record to the sheet."""
        self.__output__('下载文件:%s' % record.extension_file_url)
        record.extension_file_path = self.__download__(
            record.extension_file_url)
        if record.extension_file_path is None:
            self.__output__('文件下载失败')
            return
        self.__output__('记录数据')
        self.__write_excel__(record)
        self.__output__('记录完成')

    def __write_excel__(self, record):
        """Append one Record as a new row at the bottom of the workbook."""
        workbook_in = xlrd.open_workbook(self.file)
        next_row = workbook_in.sheets()[0].nrows
        workbook_out = copy(workbook_in)
        sheet = workbook_out.get_sheet(0)
        values = (record.task_id, record.sub_task_id,
                  record.sub_sub_task_id, record.step_id,
                  record.extension_file_url, record.extension_file_path)
        for col, value in enumerate(values):
            sheet.write(next_row, col, value)
        workbook_out.save(self.file)

    def __write_excel_head__(self):
        """Create the workbook at ``self.file`` with the header row."""
        book = Workbook()
        sheet = book.add_sheet('data')
        headers = ('任务id', '子任务id', '子子任务id',
                   '步骤id', '数据文件URL', '本地文件路径')
        for col, header in enumerate(headers):
            sheet.write(0, col, header)
        book.save(self.file)

    # Backward-compatible alias for the old, misspelled helper name.
    __write_execl_head__ = __write_excel_head__

    def __download__(self, url):
        """Download ``url`` into ``self.csv_dir``.

        :return: the local file path, or ``None`` on any failure.
        """
        # The file name is everything after the last '/'.
        filename = url[url.rfind('/') + 1:]
        if not path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)
        file_path = path.join(self.csv_dir, filename)
        try:
            http = urllib3.PoolManager()
            response = http.request('GET', url)
            with open(file_path, "wb") as out:
                out.write(response.data)
            return file_path
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt
            exc = traceback.format_exc()
            logger.error(TAG, u'文件下载失败,url:%s' % url, exc)
            return None

    def __output__(self, string):
        """Log ``string`` and forward it to the optional output sink."""
        logger.debug(TAG, string)
        if self.output_obj is not None:
            self.output_obj.output(string)
# Code example #2
class Filter(object):
    """HTML cleaners for scraped articles (WeChat, Toutiao, iFeng).

    Each ``filter_*`` method takes raw page HTML and returns a cleaned
    HTML document (or ``None`` when the page has no usable body).
    Note: this module is Python 2 code (``unicode``, byte-string
    ``.decode``/``.encode``, ``hashlib.md5`` on ``str``).
    """

    def __init__(self, config):
        """:param config: project config providing redis/mongo/oss settings."""
        self.config = config
        self.redis = SRedis(self.config)
        self.mongo = Mongo(self.config)

    def filter_wx_article(self, html, encoding, title=None):
        """Clean a WeChat article page.

        Strips scripts/styles/links/iframes, rewrites image URLs to the
        OSS host and queues each image for download via Redis.

        :return: cleaned HTML (byte-encoded with ``encoding`` if it was
            unicode), or ``None`` when the article body div is missing.
        """
        html = re.sub(r'<script.*?>[\s\S]+?</script>', '', html)
        html = re.sub(r'<style[\s\S]*?>[\s\S]+?</style>', '', html)
        html = re.sub(r'<link.*?>', '', html)
        html = re.sub(r'<iframe.*?>.*?</iframe>', '', html)

        tree = BeautifulSoup(html, 'lxml')
        div = tree.find('div', class_='rich_media_content')
        if div is None:
            return None
        # Collect (src, type) for every <img> that carries both attributes.
        img_tuple_list = [
            (img.get('data-src', ''), img.get('data-type', ''))
            for img in div.findAll('img')
            if img.get('data-type', '') and img.get('data-src', '')
        ]

        img_name_list = [
            self.get_img_name(src, type_) for src, type_ in img_tuple_list
        ]
        html = re.sub('data-src', 'src', html)
        for (img_path, _), new_img_name in zip(img_tuple_list, img_name_list):
            # Point the page at the OSS copy and queue the original for
            # download as "new_name|original_url".
            html = html.replace(img_path, self.config.oss.url + new_img_name)
            need_download_image = "{}|{}".format(new_img_name, img_path)
            self.redis.exec_redis('sadd', 'need_handle_image',
                                  need_download_image)
        html = re.sub(r'<p.*?>', '<p>', html)
        html = re.sub(r'<br.*?/>', '', html)
        html = re.sub(r'<a[\s\S]*?>[\s\S]*?</a>', '', html)
        html = re.sub(r'<!--[\s\S]+?-->', '', html)
        html = re.sub(r'<body.*?>', '<body>', html)
        html = re.sub(r'<section[\s\S]*?>', '<section>', html)
        html = re.sub(r'<span[\s\S]*?>', '<span>', html)
        # Python 2: normalize unicode back to the caller's byte encoding.
        if isinstance(html, unicode):
            html = html.encode(encoding)
        return html

    def filter_toutiao_article(self, html, encoding, title=None):
        """Dispatch a Toutiao page to the matching filter.

        Official article pages embed an ``articleInfo`` JS object; pages
        without it fall through to the generic handler.
        """
        pattern = r'articleInfo: (\{[\s\S]+?\}),\s*?commentInfo'
        need_data = re.search(pattern, html)
        if need_data is None:
            return self.filter_different_toutiao_article(html, encoding, title)
        # Python 2: decode the captured byte string to unicode.
        html = need_data.groups()[0].decode(encoding)
        return self.filter_office_toutiao_article(html, encoding, title)

    def filter_ifeng_article(self, html, encoding, md5, title=None):
        """Clean an iFeng article page into a minimal HTML document.

        :param md5: content hash used to purge the Mongo record when the
            page has no ``main_content`` body.
        :return: the rebuilt HTML, or ``None`` when the body is missing
            (the stale Mongo record is removed in that case).
        """
        html = re.sub(r'<script.*?>[\s\S]+?</script>', '', html)
        html = re.sub(r'<style[\s\S]*?>[\s\S]+?</style>', '', html)
        html = re.sub(r'<link.*?>', '', html)
        html = re.sub(r'<iframe.*?>.*?</iframe>', '', html)
        # iFeng pages are always handled as UTF-8, regardless of caller hint.
        encoding = 'utf8'
        if html.find('id="main_content"') != -1:
            tree = BeautifulSoup(html, 'lxml')
            title = tree.find('h1').text.encode(encoding)
            original_time = re.search(
                r'(\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}:\d{2})', html).groups()[0]
            div = tree.find('div', id='main_content')
            body = str(div)
            html = '''
            <html>
                <head>
                    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                    <title>{}</title>
                </head>
                <body>
                    <h1>{}</h1>
                    <div>
                        <div>
                            <span>{}</span>
                        </div>
                        <div>
                            {}
                        </div>
                    </div>
                </body>
            </html>'''.format(title, title, original_time, body)
            return html
        # No article body: drop the stale record and return None.
        collection = self.mongo.get_collection(
            collection='article_big_image')
        self.mongo.remove(collection, {'content': md5})
        return None

    def filter_office_toutiao_article(self, html, encoding, title):
        """Rebuild an official Toutiao article from its ``articleInfo`` blob.

        Also stamps ``update_at``/``original_time`` onto the matching Mongo
        record keyed by title.
        """
        title_pattern = r'title:\s*\'([\s\S]+?)\''
        content_pattern = r'content:\s*\'([\s\S]+?)\''
        subInfo_pattern = r'subInfo:\s*(\{([\s\S]+?)\})'
        source_pattern = r'source:\s*\'([\s\S]+?)\''
        time_pattern = r'time:\s*\'([\s\S]+?)\''

        parser = HTMLParser()

        title = re.search(title_pattern, html).groups()[0].encode('utf8')
        content = re.search(content_pattern, html).groups()[0]
        # Content arrives HTML-entity-escaped inside the JS literal.
        content = parser.unescape(content).encode('utf8')

        subInfo = re.search(subInfo_pattern, html).groups()[0]
        source = re.search(source_pattern, html).groups()[0].encode('utf8')
        time_ = re.search(time_pattern, html).groups()[0]

        collection = self.mongo.get_collection(collection='article_big_image')
        update_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp = int(
            time.mktime(
                datetime.datetime.strptime(time_,
                                           '%Y-%m-%d %H:%M:%S').timetuple()))
        # NOTE(review): the update document is passed before the query
        # filter here — confirm the Mongo wrapper expects this order.
        self.mongo.update(
            collection,
            {'$set': {
                'update_at': update_time,
                'original_time': timestamp
            }}, {'title': title})
        store_html = '''<html>
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                <title>{}</title>
            </head>
            <body>
                <h1>{}</h1>
                <div>
                    <div>
                        <span>{}</span><span>{}</span>
                    </div>
                    <div>
                        {}
                    </div>
                </div>
            </body>
        </html>'''.format(title, title, source, time_, content)

        return store_html

    def filter_different_toutiao_article(self, html, encoding, title):
        """Fallback for non-official Toutiao pages: pass HTML through as-is."""
        return html

    def get_img_name(self, img, type_):
        """Build a randomized OSS object name for an article image.

        The random salt makes repeated scrapes produce distinct names.
        """
        to_sign = "{}{}".format(img, random.randint(1, 1000))
        md5_sign = hashlib.md5(to_sign).hexdigest()  # Python 2: str input
        img_name = "image/weixin-{}.{}".format(md5_sign, type_)
        return img_name