Example #1
0
    def test_common_mysql(self):
        mysql_conn = CommonMysql('test').connection()
        mysql_cursor = mysql_conn.cursor()

        mysql_cursor.execute('show databases')
        dbs = mysql_cursor.fetchall()
        self.assertEqual(('test',) in dbs, True)
Example #2
0
    def archive_imgs(absurl,
                     spider_source,
                     spider_url='',
                     table=None,
                     md5=None,
                     image_filename=None,
                     mysql=None,
                     ignore=True,
                     config_file=None):

        table = (table or ImageMagic.IMAGE_TABLE)
        md5 = (md5 or md5_string(absurl))
        image_filename = (image_filename or md5 + DEFAULT_IMG_EXT)
        mysql = (mysql or CommonMysql('question_db_offline',
                                      config_file=config_file))

        _acquireLock()
        try:
            # make sure one connect to be runing at one same time
            mysql.insert(table,
                         dict(url=absurl,
                              source=spider_source,
                              spider_url=spider_url,
                              md5=md5,
                              image_filename=image_filename),
                         ignore=ignore)
        finally:
            _releaseLock()
Example #3
0
    def __init__(self,
                 uri2oss=None,
                 mysql_db=None,
                 download=True,
                 archive_image=True,
                 proxy=False,
                 img_ext=DEFAULT_IMG_EXT,
                 config_file=None,
                 exclude_md5s=None,
                 priority=False):

        self.download = download
        self.archive_image = archive_image
        self.img_ext = img_ext
        self.exclude_md5s = set(
        )  # md5 in which exclude_md5s will not be downloaded
        self.proxy = proxy
        self.priority = priority

        # Strongly recommand to set uri2oss
        # ignoring reloading oss configure file
        self.uri2oss = (uri2oss or Uri2oss())

        if config_file is None:
            top_dir = os.path.dirname(afanti_tiku_lib.__file__)
            config_file = os.path.join(top_dir, DEFAULT_CONFIG_FILE)
        else:
            config_file = config_file

        self.config_file = config_file

        self.mysql_db = (mysql_db or MYSQL_DEFAULT_DB)

        self.mysql = CommonMysql(self.mysql_db, config_file=config_file)
        self.mysql_conn = None

        self.redis_proxy = RedisProxy('image_downloader.',
                                      config_file=config_file)
        self.image_downloader_item_queue = \
            self.redis_proxy.make_list('item_queue')

        self._check_exclude_md5(exclude_md5s)
Example #4
0
async def run(args):
    global mysql
    global mysql_conn

    mysql = CommonMysql(args.db, config_file=args.config_file)
    mysql_conn = mysql.connection()

    ctrl_queue = asyncio.queues.Queue(maxsize=args.ctrl_queue_size)

    max_id = 0
    while True:
        sql = select_sql(args.table,
                         COLS_HEADERS + [cs[0] for cs in args.cols],
                         condition=args.condition.format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        await ctrl_queue.put(None)
        asyncio.ensure_future(
            render_questions(args.table, rows, ctrl_queue, args))

        if args.test:
            break

        max_id = rows[-1][0]
        logger.info('{} [max_id]:{}'.format(args.table, max_id))

    while True:
        logger.info('[ctrl_queue.qsize]: {}'.format(ctrl_queue.qsize()))
        if ctrl_queue.qsize() != 0:
            await asyncio.sleep(1 * 60)
        else:
            # over
            break

    logger.info('# over')
Example #5
0
class ImageMagic(object):

    IMAGE_TABLE = 'image_archive'

    def __init__(self,
                 uri2oss=None,
                 mysql_db=None,
                 download=True,
                 archive_image=True,
                 proxy=False,
                 img_ext=DEFAULT_IMG_EXT,
                 config_file=None,
                 exclude_md5s=None,
                 priority=False):

        self.download = download
        self.archive_image = archive_image
        self.img_ext = img_ext
        self.exclude_md5s = set(
        )  # md5 in which exclude_md5s will not be downloaded
        self.proxy = proxy
        self.priority = priority

        # Strongly recommand to set uri2oss
        # ignoring reloading oss configure file
        self.uri2oss = (uri2oss or Uri2oss())

        if config_file is None:
            top_dir = os.path.dirname(afanti_tiku_lib.__file__)
            config_file = os.path.join(top_dir, DEFAULT_CONFIG_FILE)
        else:
            config_file = config_file

        self.config_file = config_file

        self.mysql_db = (mysql_db or MYSQL_DEFAULT_DB)

        self.mysql = CommonMysql(self.mysql_db, config_file=config_file)
        self.mysql_conn = None

        self.redis_proxy = RedisProxy('image_downloader.',
                                      config_file=config_file)
        self.image_downloader_item_queue = \
            self.redis_proxy.make_list('item_queue')

        self._check_exclude_md5(exclude_md5s)

    def _check_exclude_md5(self, exclude_md5s):
        if exclude_md5s:
            if hasattr(exclude_md5s, '__iter__'):
                for md5 in exclude_md5s:
                    self.exclude_md5s.add(md5)
            else:
                raise TypeError('exclude_md5s must be iterable')

    def bewitch(self,
                html_string,
                spider_url,
                spider_source,
                download=None,
                redownload=False,
                archive_image=None,
                headers=None,
                proxy=None,
                exclude_md5s=None,
                img_ext=None,
                priority=None):

        download = (download, self.download)[download is None]
        archive_image = (archive_image,
                         self.archive_image)[archive_image is None]
        proxy = (proxy or self.proxy)
        img_ext = (img_ext or self.img_ext)
        priority = (priority or self.priority)
        self._check_exclude_md5(exclude_md5s)

        html_string, img_infos = ImageMagic.img_ossify(html_string,
                                                       spider_source,
                                                       uri2oss=self.uri2oss)

        for (ori_url, absurl, md5, ext) in img_infos:

            # if isinstance(img_ext, compat_str):
            # image_filename = md5 + img_ext
            # elif img_ext is True:
            # image_filename = md5 + ext
            # elif img_ext is None:
            # image_filename = md5
            # else:
            # image_filename = md5

            if ext.lower() == '.svg':
                image_filename = md5 + '.svg'
            else:
                image_filename = md5 + DEFAULT_IMG_EXT

            # not save to db
            # images which are in image_archive must been downloaded
            # successfully
            # if archive_image:
            # ImageMagic.archive_imgs(absurl, spider_source,
            # spider_url=spider_url,
            # table=self.IMAGE_TABLE,
            # md5=md5,
            # image_filename=image_filename,
            # mysql=self.mysql,
            # ignore=True,
            # config_file=self.config_file)

            if md5 in self.exclude_md5s:
                continue

            if redownload:
                if not self.mysql_conn:
                    self.mysql_conn = self.mysql.connection()

                sql = 'delete from {} where `md5` = %s'.format(
                    self.IMAGE_TABLE)
                execute(self.mysql_conn, sql, values=(md5, ))

            # send to image_downloader to download
            if download:
                ImageMagic.download_image(
                    absurl,
                    spider_source,
                    image_filename=image_filename,
                    spider_url=spider_url,
                    headers=headers,
                    proxy=proxy,
                    priority=priority,
                    queue=self.image_downloader_item_queue,
                    config_file=self.config_file)
        return html_string

    @staticmethod
    def img_ossify(html_string, spider_source, uri2oss=None):
        '''
        extracting img urls and converting them to oss format
        '''

        uri2oss = (uri2oss or Uri2oss())
        img_infos = extract_image_info_with_md5(html_string)

        for (ori_url, absurl, md5, ext) in img_infos:
            if ext.lower() == '.svg':
                image_filename = md5 + '.svg'
            else:
                image_filename = md5 + DEFAULT_IMG_EXT

            # calculate oss_img_url
            oss_img_url = uri2oss.convert(image_filename, spider_source)
            if oss_img_url is None:
                continue

            # replace original url
            html_string = html_string.replace(ori_url, oss_img_url)

        return html_string, img_infos

    @staticmethod
    def archive_imgs(absurl,
                     spider_source,
                     spider_url='',
                     table=None,
                     md5=None,
                     image_filename=None,
                     mysql=None,
                     ignore=True,
                     config_file=None):

        table = (table or ImageMagic.IMAGE_TABLE)
        md5 = (md5 or md5_string(absurl))
        image_filename = (image_filename or md5 + DEFAULT_IMG_EXT)
        mysql = (mysql or CommonMysql('question_db_offline',
                                      config_file=config_file))

        _acquireLock()
        try:
            # make sure one connect to be runing at one same time
            mysql.insert(table,
                         dict(url=absurl,
                              source=spider_source,
                              spider_url=spider_url,
                              md5=md5,
                              image_filename=image_filename),
                         ignore=ignore)
        finally:
            _releaseLock()

    @staticmethod
    def download_image(absurl,
                       spider_source,
                       image_filename=None,
                       spider_url='',
                       headers=None,
                       proxy=False,
                       priority=False,
                       queue=None,
                       config_file=None):
        '''
        send image info to image_downloader
        '''

        if queue is None:
            redis_proxy = RedisProxy('image_downloader.',
                                     config_file=config_file)
            queue = redis_proxy.make_list('item_queue')

        item = dict(url=absurl,
                    source=spider_source,
                    md5_name=image_filename,
                    spider_url=spider_url,
                    headers=headers,
                    proxy=proxy,
                    priority=priority)
        queue.put(item)
Example #6
0
# -*- coding: utf-8 -*-

import json

from afanti_tiku_lib.dbs.mysql_pool import CommonMysql
from afanti_tiku_lib.dbs.execute import execute
from afanti_tiku_lib.dbs.sql import select_sql

from parser import VkoParser

parser = VkoParser(download=False)

mysql = CommonMysql('html_archive2')
mysql_conn = mysql.connection()


def test():
    sql = select_sql('vko_spider_html_archive_table',
                     ('key', 'html'),
                     condition='where `key` = "vko_qs_970"')
                     # condition='where html_id = 11496')
                     # condition='where html_id > 0 limit 10')
    rows = execute(mysql_conn, sql)

    for row in rows:
        url = row[0]
        js = json.loads(row[1])

        cols = parser.parse(js, url)

        print(json.dumps(cols, indent=4, ensure_ascii=False))