Exemple #1
0
def update(download_media):
    # type: (DownloadMedia) -> long
    """Write ``download_media`` back to its existing ``download_media`` row.

    The row is selected by ``download_media.id`` and the column values come
    from ``download_media.from_obj_to_json()``.

    :param download_media: object exposing ``id`` and ``from_obj_to_json()``.
    :return: whatever ``client.update(sql)`` returns.
    """
    row_filter = {"id": download_media.id}
    update_sql = build_update_sql(
        'download_media',
        download_media.from_obj_to_json(),
        row_filter,
    )
    db_client = ConfigInit().get_conn()
    logging.info('update download_media: %d' % download_media.id)
    return db_client.update(update_sql)
Exemple #2
0
def select_to_merge():
    """Fetch downloaded rows that have a merge order but no merge status yet.

    The query selects rows of ``crawler_online.download_media`` whose
    ``merged_order`` is neither 0 nor -1, whose ``download_status`` is 1,
    whose ``merged_status`` is null and whose ``download_path`` is not
    ``/data/dev_ant/``.

    :return: the result of ``client.getAll`` for the query, unmodified.
    """
    query = """
    select * from crawler_online.download_media where merged_order not in (0,-1) and download_status=1 
    and merged_status is null and download_path!='/data/dev_ant/';
    """
    return ConfigInit().get_conn().getAll(query)
Exemple #3
0
def insert(download_media):
    # type: (DownloadMedia) -> long
    """Insert ``download_media`` as a new ``download_media`` row.

    :param download_media: object exposing ``from_obj_to_json()`` and
        ``download_url``.
    :return: the value returned by ``client.insertOne`` (logged as the new id).
    """
    insert_sql = build_insert_sql('download_media',
                                  download_media.from_obj_to_json())
    new_id = ConfigInit().get_conn().insertOne(insert_sql)
    logging.info("insert new download_media %d, %s" % (new_id, download_media.download_url))
    return new_id
Exemple #4
0
def select_not_download_over_file():
    """List ``absolute_path`` for every row whose ``download_status`` is not 1.

    :return: list of ``absolute_path`` values; empty list when the query
        yields nothing.
    """
    query = """
    SELECT absolute_path FROM crawler_online.download_media where download_status!=1;
    """
    rows = ConfigInit().get_conn().getAll(query)
    if not rows:
        return []
    return [row['absolute_path'] for row in rows]
Exemple #5
0
def select_tmp():
    """Fetch a fixed set of columns for every row of ``crawler_online.download_media``.

    :return: the result of ``client.getAll`` for the query, unmodified.
    """
    query = """
    select id,original_url,absolute_path,download_url,media_type,language,file_type 
    from crawler_online.download_media
    """
    return ConfigInit().get_conn().getAll(query)
Exemple #6
0
def select_original_url_downloaded_merged_media(urls_list):
    """Return the URLs from ``urls_list`` that already have a downloaded merged mp4.

    A URL qualifies when a ``download_media`` row exists for it with
    ``media_type='merged'``, ``file_type='mp4'`` and ``download_status=1``.

    :param urls_list: original URLs to look up.
    :return: list of matching ``original_url`` values; empty list when
        ``urls_list`` is empty or nothing matches.
    """
    if not urls_list:
        # An empty list would render ``in ()``, which is not valid SQL.
        return []
    # NOTE(review): the URLs are interpolated into the statement without
    # escaping; a value containing a quote breaks (or alters) the query.
    # Switch to driver-side parameters if the client supports them.
    quoted_urls = ', '.join("'%s'" % url for url in urls_list)
    sql = """
    select original_url from crawler_online.download_media
    where media_type='merged' and file_type='mp4' and download_status=1 and original_url in (%s)
    ;""" % quoted_urls
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
Exemple #7
0
def select_to_merge():
    """Fetch the rows belonging to merge groups that still need merging.

    The inner query collects every non-empty ``merged_sign`` that occurs more
    than once among rows with ``merged_status!=1`` whose ``media_type`` is not
    ``consts.constant_manager.MERGED``; the outer query returns all rows with
    one of those signs, ordered by ``merged_sign`` then ``merged_order``.

    :return: the result of ``client.getAll`` for the query, unmodified.
    """
    query = """
    select * from download_media where merged_sign in 
    (select merged_sign from crawler_online.download_media where merged_status!=1 and merged_sign!='' and media_type!="%s"
    group by merged_sign having count(merged_sign)>1)
    order by merged_sign ,merged_order;
    """ % consts.constant_manager.MERGED
    return ConfigInit().get_conn().getAll(query)
Exemple #8
0
def select_original_url_downloaded_video_audio(urls_list):
    """Return the URLs from ``urls_list`` with more than one downloaded video/audio row.

    Rows are counted per ``original_url`` among those with ``media_type`` of
    ``video`` or ``audio`` and ``download_status=1``; only URLs with a count
    greater than one are returned.

    :param urls_list: original URLs to look up.
    :return: list of matching ``original_url`` values; empty list when
        ``urls_list`` is empty or nothing matches.
    """
    if not urls_list:
        # An empty list would render ``in ()``, which is not valid SQL.
        return []
    # NOTE(review): the URLs are interpolated into the statement without
    # escaping; a value containing a quote breaks (or alters) the query.
    # Switch to driver-side parameters if the client supports them.
    quoted_urls = ', '.join("'%s'" % url for url in urls_list)
    sql = """
    select original_url from crawler_online.download_media
    where 
    (media_type='video' or media_type='audio' ) and download_status=1 and original_url in (%s)
    group by original_url having count(original_url)>1;""" % quoted_urls
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
Exemple #9
0
def select_original_url_downloaded_subtitle(urls_list):
    """Return the URLs from ``urls_list`` that already have a downloaded subtitle.

    A URL qualifies when a ``download_media`` row exists for it with
    ``media_type='subtitle'`` and ``download_status=1``.

    :param urls_list: original URLs to look up.
    :return: list of distinct matching ``original_url`` values; empty list
        when ``urls_list`` is empty or nothing matches.
    """
    if not urls_list:
        # An empty list would render ``in ()``, which is not valid SQL.
        return []
    # NOTE(review): the URLs are interpolated into the statement without
    # escaping; a value containing a quote breaks (or alters) the query.
    # Switch to driver-side parameters if the client supports them.
    quoted_urls = ', '.join("'%s'" % url for url in urls_list)
    sql = """
    select distinct original_url from download_media
    where 
    media_type='subtitle' and download_status=1
    and original_url in (%s);
    """ % quoted_urls
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
Exemple #10
0
def save(download_media):
    # type: (DownloadMedia) -> long
    """Insert or update ``download_media`` depending on whether its hash already exists.

    A row with the same ``hash_sign`` is looked up first. When one is found
    its id is copied onto ``download_media`` and the row is updated;
    otherwise a new row is inserted and the new id is stored on the object.

    :param download_media: object exposing ``hash_sign``, ``download_url``
        and a writable ``id``.
    :return: ``download_media.id`` after the insert or update.
    """
    logging.debug('save download_media: %s', download_media.download_url)
    db_client = ConfigInit().get_conn()
    existing_row = db_client.getOne(
        'select id from download_media where hash_sign="%s"' % download_media.hash_sign)
    if not existing_row:
        download_media.id = insert(download_media)
    else:
        download_media.id = existing_row['id']
        update(download_media)
    return download_media.id
Exemple #11
0
def scheduler_remote_service(urls):
    """Forward each URL to the remote service's ``/to_controller`` endpoint.

    The remote host is read from the ``remote_ip`` config option and is
    contacted on port 1080. Each response body is logged at debug level.

    :param urls: iterable of URLs to hand over to the remote controller.
    :return: None
    """
    remote_ip = ConfigInit().get_config_by_option('remote_ip')
    endpoint = 'http://%s:1080/to_controller' % remote_ip
    for url in urls:
        # Pass the URL through ``params`` so requests percent-encodes it.
        # Pasting it into the query string by hand truncates or corrupts any
        # URL that itself contains '&', '#' or '?'.
        response = requests.get(endpoint, params={'url': url})
        logging.debug(response.text)
Exemple #12
0
    def process(self, content):
        """Turn a parsed stream description into DB-save and download tasks.

        ``content`` is passed to ``get_and_download_stream_obj`` and the result
        is decoded with ``from_string_to_json``. When the decoded ``type`` is
        ``consts.constant_manager.DOWNLOAD``, every entry of
        ``download_file_list`` is queued once for persistence and once for
        download (with ``priority=True`` when its priority exceeds 50).

        :param content: input handed to ``get_and_download_stream_obj``.
        :return: the decoded stream description.
        """
        stream = from_string_to_json(get_and_download_stream_obj(content))

        if stream['type'] != consts.constant_manager.DOWNLOAD:
            return stream

        for item in stream['download_file_list']:
            name = get_file_name_by_download_url(item['download_url'])
            if item['media_type'] == consts.constant_manager.SUBTITLE:
                # Subtitles are named after the site, the page URL and the language.
                name = '_'.join([
                    stream['site'],
                    get_file_name_by_download_url(stream['original_url']),
                    item['language'],
                ])
            download_file = DownloadFile(download_url=item['download_url'],
                                         file_name=name,
                                         site=stream['site'],
                                         original_url=stream['original_url'])

            media_record = {
                'video_url': stream['video_url'],
                'original_url': stream['original_url'],
                'download_url': item['download_url'],
                'media_quality': item['media_quality'],
                'episode': stream['episode'],
                'download_path': ConfigInit().get_config_by_option('download_path'),
                'media_name': stream['media_name'],
                'hash_sign': get_hash_sign(name),
                'media_type': item['media_type'],
                'site': stream['site'],
                'language': item['language'],
                'merged_sign': item['merged_sign'],
                'merged_order': item['merged_order'],
            }
            scheduler_db_save_queue(media_record)
            # todo: fine-grained management of download priority
            queue_kwargs = {'priority': True} if int(item['priority']) > 50 else {}
            scheduler_download_queue(download_file.from_obj_to_json(), **queue_kwargs)
        return stream
Exemple #13
0
 def get_driver(self, name='chrome', type='headless'):
     """Create a Selenium driver for ``name``, store it in ``self._driver`` and return it.

     Supported names are ``phantomjs``, ``chrome`` and ``firefox``. For any
     other name nothing is created and ``self._driver[name]`` is returned
     as-is (raising ``KeyError`` when absent).

     :param name: browser to start.
     :param type: ``'headless'`` adds ``--headless`` for Chrome and Firefox.
     :return: the driver stored under ``name``.
     """
     # todo: memory leak issue; per-browser configuration management
     deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
     headless = type == 'headless'
     if name == 'phantomjs':
         capabilities = dict(DesiredCapabilities.PHANTOMJS)
         capabilities.update({
             "phantomjs.page.settings.userAgent": random.choice(consts.USER_AGENTS),
             "phantomjs.page.settings.loadImages": False,
         })
         self._driver[name] = webdriver.PhantomJS(
             desired_capabilities=capabilities,
             executable_path=deploy_home + '/src/config/phantomjs')
     elif name == 'chrome':
         options = ChromeOptions()
         for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
             options.add_argument(flag)
         # No proxy is configured for Chrome.
         capabilities = dict(DesiredCapabilities.CHROME)
         capabilities["chrome.page.settings.loadImages"] = False
         if headless:
             options.add_argument("--headless")
         chrome_binary = deploy_home + ConfigInit().get_config_by_option('chrome_path')
         self._driver[name] = webdriver.Chrome(
             desired_capabilities=capabilities,
             executable_path=chrome_binary,
             chrome_options=options)
     elif name == 'firefox':
         options = FirefoxOptions()
         if headless:
             options.add_argument("--headless")
         self._driver[name] = webdriver.Firefox(
             executable_path=deploy_home + '/src/config/geckodriver_mac',
             firefox_options=options)
     return self._driver[name]
Exemple #14
0
def demo_del_file():
    """Delete local copies of downloaded media whose URLs are listed in a file.

    Reads one original URL per line from ``/data/my_ant/play_urls1``, looks up
    the ``absolute_path`` of rows downloaded under ``/data/dev_ant/`` for
    those URLs, and deletes each path that exists. A failure on one file is
    printed and does not stop the remaining deletions.

    :return: None
    """
    # Delete locally downloaded domestic dramas.
    # The context manager closes the URL list as soon as it has been read.
    with open('/data/my_ant/play_urls1') as url_file:
        urls = [line.replace('\n', '') for line in url_file]
    # Blank lines would otherwise become '' entries in the SQL in-list.
    urls = [url for url in urls if url]
    if not urls:
        # An empty list would render ``in ()``, which is not valid SQL.
        return
    sql = """
    SELECT absolute_path FROM crawler_online.download_media where download_status=1 and download_path='/data/dev_ant/' and 
    original_url in (%s);
    """ % ', '.join("'%s'" % url for url in urls)
    client = ConfigInit().get_conn()
    all_tuple = client.getAll(sql)
    dalu_files_local = [url_dict['absolute_path']
                        for url_dict in all_tuple] if all_tuple else []
    for file_path in dalu_files_local:
        try:
            if os.path.exists(file_path):
                del_file(file_path)
        except Exception:
            # Best effort: report the failure and keep deleting the rest.
            traceback.print_exc()
Exemple #15
0
 def get_driver(self, name='chrome', type='headless'):
     """Return the cached Selenium driver for ``name``, creating it on first use.

     Supported names are ``phantomjs``, ``chrome`` and ``firefox``; any other
     name that is not already cached yields ``None``.

     :param name: browser to start.
     :param type: ``'headless'`` adds ``--headless`` for Chrome and Firefox.
     :return: the driver for ``name``, or ``None`` for an unsupported name.
     """
     # todo: memory leak issue; per-browser configuration management
     # ``with`` releases the lock on every path, including the early return.
     # Returning while the lock is still held would block every later call.
     with self._instance_lock:
         if name in self._driver:
             return self._driver[name]
     # NOTE(review): driver creation below is not covered by the lock, so two
     # threads can still both create a driver for the same name.
     deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
     if name == 'phantomjs':
         dcap = dict(DesiredCapabilities.PHANTOMJS)
         dcap["phantomjs.page.settings.userAgent"] = (random.choice(
             consts.USER_AGENTS))
         dcap["phantomjs.page.settings.loadImages"] = False
         driver_phantomjs = webdriver.PhantomJS(
             desired_capabilities=dcap,
             executable_path=deploy_home + '/src/config/phantomjs')
         self._driver[name] = driver_phantomjs
         return driver_phantomjs
     elif name == 'chrome':
         opts = ChromeOptions()
         opts.add_argument('--no-sandbox')
         opts.add_argument('--disable-dev-shm-usage')
         dcap = dict(DesiredCapabilities.CHROME)
         dcap["chrome.page.settings.loadImages"] = False
         if type == 'headless':
             opts.add_argument("--headless")
         chrome_driver = webdriver.Chrome(
             desired_capabilities=dcap,
             executable_path=deploy_home +
             ConfigInit().get_config_by_option('chrome_path'),
             chrome_options=opts)
         self._driver[name] = chrome_driver
         return chrome_driver
     elif name == 'firefox':
         opts = FirefoxOptions()
         if type == 'headless':
             opts.add_argument("--headless")
         firefox_driver = webdriver.Firefox(executable_path=deploy_home +
                                            '/src/config/geckodriver_mac',
                                            firefox_options=opts)
         self._driver[name] = firefox_driver
         return firefox_driver
Exemple #16
0
 def merge_media(self, to_merged_medias_lists):
     """Concatenate the given media files into one mp4 with FFmpeg and return its path.

     The output lives in the configured download path and is named after
     ``hash_md5`` of the first entry's ``merged_sign``. An existing file at
     that path is deleted first. FFmpeg is run with ``-c copy`` on the output.

     :param to_merged_medias_lists: list of dict-like media records, each
         with ``absolute_path``; the first must also have ``merged_sign``.
     :return: absolute path of the merged file.
     """
     first_media = to_merged_medias_lists[0]
     merged_path = (ConfigInit().get_download_path() +
                    hash_md5(first_media['merged_sign']) + '.mp4')
     # todo: force ffmpeg merge processing
     if exist_file(merged_path):
         del_file(merged_path)
     source_files = {media['absolute_path']: '' for media in to_merged_medias_lists}
     FFmpeg(inputs=source_files, outputs={merged_path: '-c copy'}).run()
     return merged_path
Exemple #17
0
 def pre_parse_download_obj(self, download_file_obj):
     # type: (DownloadFile) -> bool
     """
     Probe the download URL and fill in the metadata of ``download_file_obj``.

     Sets ``file_type`` and ``total_size`` from the response headers, then
     ``file_name`` and ``download_path`` (only when empty), ``hash_sign``,
     ``download_type`` (for gzip-encoded responses) and ``absolute_path``.

     :param download_file_obj: download descriptor to complete in place.
     :return: ``False`` when the request fails before headers are read, when
         the server answers 400, or when the content type is neither mp4 nor
         text; ``True`` otherwise.
     """
     headers_json = None
     response = None
     try:
         headers = {
             'User-Agent':
             random.choice(consts.constant_manager.USER_AGENTS)
         }
         response = requests.get(download_file_obj.download_url,
                                 stream=True,
                                 headers=headers)
         if response.status_code == 400:
             logging.error('invalid timestamp %s' %
                           download_file_obj.download_url)
             return False
         # Keep requests' own header mapping: it is case-insensitive, whereas
         # a plain dict copy misses headers sent in a different case.
         headers_json = response.headers
         content_type = headers_json['Content-Type']
         if 'mp4' in content_type:
             download_file_obj.file_type = 'mp4'
         elif 'text' in content_type:
             download_file_obj.file_type = 'txt'
         else:
             logging.error('unknow file_type in %s' %
                           download_file_obj.download_url)
             return False
         download_file_obj.total_size = int(headers_json['Content-Length'])
     except Exception:
         traceback.print_exc()
         logging.error('pre_parse_download_obj error download_url %s' %
                       download_file_obj.download_url)
         if headers_json is None:
             # The request itself failed, so there is nothing to inspect below.
             return False
     finally:
         # Only the headers are needed; release the streamed connection.
         if response is not None:
             response.close()
     if download_file_obj.file_name == '':
         download_file_obj.file_name = get_file_name_by_download_url(
             download_file_obj.download_url)
     download_file_obj.hash_sign = get_hash_sign(
         file_name=download_file_obj.file_name)
     if download_file_obj.download_path == '':
         download_file_obj.download_path = ConfigInit().get_download_path()
     # todo: resuming downloads of gzip-compressed files
     if 'Content-Encoding' in headers_json and headers_json[
             'Content-Encoding'] == 'gzip':
         download_file_obj.download_type = consts.constant_manager.RE_DOWNLOAD
     download_file_obj.absolute_path = download_file_obj.download_path + download_file_obj.file_name + '.' + download_file_obj.file_type
     return True
Exemple #18
0
def demo_browser():
    """Load one page in headless Chrome and print its HTML source.

    The page load is limited to 3 seconds; on timeout the traceback is
    printed and whatever source the driver holds is still printed. The
    browser is always shut down before returning.

    :return: None
    """
    deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
    opts = ChromeOptions()
    # opts.binary_location = '/usr/bin/google-chrome'
    opts.add_argument("--headless")
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    dcap = dict(DesiredCapabilities.CHROME)
    dcap["chrome.page.settings.loadImages"] = False
    chrome_driver = webdriver.Chrome(desired_capabilities=dcap,
                                     executable_path=deploy_home +
                                     '/src/config/chromedriver_mac243',
                                     chrome_options=opts)
    try:
        chrome_driver.set_page_load_timeout(3)
        try:
            chrome_driver.get(
                'https://www.viki.com/videos/170494v-dream-high-2-episode-5')
        except TimeoutException:
            traceback.print_exc()
        print(chrome_driver.page_source)
    finally:
        # Without quit() the chromedriver/Chrome processes outlive the demo.
        chrome_driver.quit()
Exemple #19
0
 def pre_parse_download_obj(self, download_file_obj):
     # type: (DownloadFile) -> None
     """
     Probe the download URL and fill in the metadata of ``download_file_obj``.

     Sets ``file_type`` and ``total_size`` from the response headers, then
     ``file_name`` and ``download_path`` (only when empty), ``hash_sign``,
     ``download_type`` (for gzip-encoded responses) and ``absolute_path``.

     :param download_file_obj: download descriptor to complete in place.
     :return: None
     """
     headers = {
         'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
     }
     response = requests.get(download_file_obj.download_url,
                             stream=True,
                             headers=headers)
     # Keep requests' own header mapping: it is case-insensitive, whereas a
     # plain dict copy misses headers sent in a different case.
     headers_json = response.headers
     # Only the headers are needed; release the streamed connection.
     response.close()
     content_type = headers_json.get('Content-Type', '')
     if 'mp4' in content_type:
         download_file_obj.file_type = 'mp4'
     elif 'text' in content_type:
         download_file_obj.file_type = 'txt'
     else:
         logging.error('unknow file_type in %s' %
                       download_file_obj.download_url)
     try:
         download_file_obj.total_size = int(headers_json['Content-Length'])
     except (KeyError, TypeError, ValueError):
         # Header missing or not a number: leave total_size untouched.
         logging.error('can not get total_size from download_url %s' %
                       download_file_obj.download_url)
     if download_file_obj.file_name == '':
         download_file_obj.file_name = hash_md5(
             download_file_obj.download_url)
     download_file_obj.hash_sign = get_hash_sign(
         file_name=download_file_obj.file_name)
     if download_file_obj.download_path == '':
         download_file_obj.download_path = ConfigInit().get_download_path()
     # todo: resuming downloads of gzip-compressed files
     if headers_json.get('Content-Encoding') == 'gzip':
         download_file_obj.download_type = 'wb+'
     download_file_obj.absolute_path = download_file_obj.download_path + download_file_obj.file_name + '.' + download_file_obj.file_type
Exemple #20
0
 def __init__(self, host=None, port=6379):
     """Set up the instance and connect via ``self.init(host, port)``.

     :param host: Redis host; when ``None`` the ``redis_ip`` config option
         is read at call time.
     :param port: Redis port.
     """
     if host is None:
         # Resolve the default here, not in the signature: a default
         # expression runs once at definition time, which would read the
         # config on import and freeze the value.
         host = ConfigInit().get_config_by_option('redis_ip')
     self.__db = None
     self.init(host, port)
Exemple #21
0
from flask import Flask, request, jsonify

from src.tools import logger

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)


# todo:高并发接口处理
@app.route('/to_controller', methods=['GET'])
def to_controller():
    """Queue the ``url`` query parameter for the controller and report the outcome.

    :return: JSON with ``info``, ``state`` and ``url``. When ``url`` is
        missing, ``state`` is ``'false'`` and the HTTP status is 400.
    """
    result_dict = {
        "info": "to_controller",
        "state": "success",
        "url": "",
    }
    url = request.args.get('url')
    if url is None:
        result_dict[
            'info'] = 'no url or url is wrong,ip is %s' % request.remote_addr
        result_dict['state'] = 'false'
        # Return here: indexing request.args['url'] on a missing key aborts
        # the request, so this error payload would never be sent.
        return jsonify(result_dict), 400
    scheduler_controller_queue(url)
    result_dict['url'] = url
    return jsonify(result_dict)


if __name__ == "__main__":
    logger.init_log()
    # Setting host to 0.0.0.0 makes the service reachable from external
    # networks as well.
    # NOTE(review): debug=True turns on the interactive debugger; confirm this
    # entry point is never used on an externally reachable host.
    service_host = ConfigInit().get_config_by_option('service_ip')
    app.run(host=service_host, port=8080, debug=True)
Exemple #22
0
    def download_media(self, download_file_obj):
        # type: (DownloadFile) -> None
        """
        Download the file described by ``download_file_obj``, resuming a partial file.

        Two downloads count as the same file when path, name and size match.
        This method only downloads and resumes; it does no other processing.

        :param download_file_obj: download descriptor; name, file_type and
            file_path may be left at their defaults (original_url)
        :return: None
        """
        if not self.pre_parse_download_obj(download_file_obj):
            logging.error('no need to download url %s' %
                          download_file_obj.download_url)
            return

        media_record = {
            'hash_sign': download_file_obj.hash_sign,
            'total_size': download_file_obj.total_size,
            'download_url': download_file_obj.download_url,
            'absolute_path': download_file_obj.absolute_path,
            'file_type': download_file_obj.file_type,
            'download_status': consts.constant_manager.NOT_DOWNLOAD_OVER,
        }

        # Already complete: record the status (if enabled) and stop.
        if self.download_over(download_file_obj):
            media_record['download_status'] = consts.constant_manager.DOWNLOAD_OVER
            if ConfigInit().get_config_by_option('save_db'):
                logging.debug('save media download_url %s, hash_sign %s' %
                              (download_file_obj.download_url,
                               download_file_obj.hash_sign))
                scheduler_db_save_queue(media_record)
            return

        # Resume from the size of whatever is already on disk.
        if os.path.exists(download_file_obj.absolute_path):
            resume_offset = os.path.getsize(download_file_obj.absolute_path)
        else:
            resume_offset = 0
        request_headers = {
            'User-Agent': random.choice(consts.constant_manager.USER_AGENTS),
            'Range': 'bytes=%d-' % resume_offset,
        }

        stream = requests.get(download_file_obj.download_url,
                              stream=True,
                              headers=request_headers)
        with closing(stream) as response:
            with open(download_file_obj.absolute_path,
                      download_file_obj.download_type) as out_file:
                chunk_size = 1024
                progress = ProgressBar(
                    download_file_obj.hash_sign,
                    download_file_obj.absolute_path,
                    total=download_file_obj.total_size,
                    now_size=float(resume_offset),
                    last_size=float(resume_offset),
                    unit="KB",
                    chunk_size=chunk_size,
                    status=consts.constant_manager.DOWNLOADING)
                scheduler_db_save_queue(media_record)
                for chunk in response.iter_content(chunk_size=chunk_size):
                    out_file.write(chunk)
                    # A pause hook would go here (e.g. self.pause()).
                    progress.refresh(count=len(chunk))

            # The download queue entry has finished downloading.
            progress.refresh(status=consts.constant_manager.DOWNLOAD_OVER)

            if not self.download_over(download_file_obj):
                logging.debug(
                    'not download over download_url %s, hash_sign %s' %
                    (download_file_obj.download_url,
                     download_file_obj.hash_sign))
            else:
                media_record['download_status'] = consts.constant_manager.DOWNLOAD_OVER
                if ConfigInit().get_config_by_option('save_db'):
                    logging.debug(
                        'download over download_url %s, hash_sign %s' %
                        (download_file_obj.download_url,
                         download_file_obj.hash_sign))
                    scheduler_db_save_queue(media_record)
Exemple #23
0
sys.path.append('../')
from src.tools.config_manager import ConfigInit
from app.scheduler import scheduler_controller_queue
from flask import Flask, request, jsonify

from src.tools import logger

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)


# todo:高并发接口处理
@app.route('/to_controller', methods=['GET'])
def to_controller():
    """Queue the ``url`` query parameter for the controller and report the outcome.

    :return: JSON with ``info``, ``state`` and ``url``. When ``url`` is
        missing, ``state`` is ``'false'`` and the HTTP status is 400.
    """
    result_dict = {
        "info": "to_controller",
        "state": "success",
        "url": "",
    }
    url = request.args.get('url')
    if url is None:
        result_dict['info'] = 'no url or url is wrong,ip is %s' % request.remote_addr
        result_dict['state'] = 'false'
        # Return here: indexing request.args['url'] on a missing key aborts
        # the request, so this error payload would never be sent.
        return jsonify(result_dict), 400
    scheduler_controller_queue(url)
    result_dict['url'] = url
    return jsonify(result_dict)


if __name__ == "__main__":
    logger.init_log()
    # Setting host to 0.0.0.0 makes the service reachable from external
    # networks as well.
    # NOTE(review): debug=True turns on the interactive debugger; confirm this
    # entry point is never used on an externally reachable host.
    service_host = ConfigInit().get_config_by_option('service_ip')
    app.run(host=service_host, port=1080, debug=True)
Exemple #24
0
def demo_mysql():
    """Run a row count on ``download_media`` as a connection check; the result is discarded."""
    ConfigInit().get_conn().getOne('select count(*) from download_media')
Exemple #25
0
    def download_media(self, download_file_obj):
        # type: (DownloadFile) -> None
        """
        Download the file described by ``download_file_obj``, resuming a partial file.

        Two downloads count as the same file when path, name and size match.
        This method only downloads and resumes; it does no other processing.

        todo: support multi-threaded download of large files in one queue;
        create the default download path automatically; subtitle download
        URLs differ between requests (cannot be resumed for now)

        :param download_file_obj: download descriptor; name, file_type and
            file_path may be left at their defaults (original_url)
        :return: None
        """
        self.pre_parse_download_obj(download_file_obj)

        download_media_json = {
            'hash_sign': download_file_obj.hash_sign,
            'total_size': download_file_obj.total_size,
            'download_url': download_file_obj.download_url,
            'absolute_path': download_file_obj.absolute_path,
            'file_type': download_file_obj.file_type,
            'download_status': consts.constant_manager.NOT_DOWNLOAD_OVER,
        }

        # A file already on disk with the expected size counts as downloaded.
        if exist_file(download_file_obj.absolute_path) and get_file_size(
                download_file_obj.absolute_path
        ) == download_file_obj.total_size:
            logging.debug('same file %s' % download_file_obj.absolute_path)
            download_media_json[
                'download_status'] = consts.constant_manager.DOWNLOAD_OVER
            if ConfigInit().get_config_by_option('save_db'):
                scheduler_db_save_queue(download_media_json)
            return
        elif os.path.exists(download_file_obj.absolute_path):
            # Partial file: resume from its current size.
            temp_size = os.path.getsize(download_file_obj.absolute_path)
        else:
            temp_size = 0
        headers = {
            'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
        }
        headers.update({'Range': 'bytes=%d-' % temp_size})

        with closing(
                requests.get(download_file_obj.download_url,
                             stream=True,
                             headers=headers)) as response:
            chunk_size = 1024
            progress = ProgressBar(download_file_obj.absolute_path,
                                   total=download_file_obj.total_size,
                                   unit="KB",
                                   chunk_size=chunk_size,
                                   run_status="正在下载",
                                   fin_status="下载完成")

            with open(download_file_obj.absolute_path,
                      download_file_obj.download_type) as file:
                # todo: generic downloader, adjust this logic
                download_file_status_json = copy.deepcopy(download_file_status)
                download_file_status_json[
                    'hash_sign'] = download_file_obj.hash_sign
                download_file_status_json[
                    'total_size'] = download_file_obj.total_size
                logging.debug('start download file %s' %
                              download_file_obj.download_url)
                scheduler_download_status_queue(download_file_status_json)
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    progress.refresh(count=len(data))
                    # todo: if a higher-priority download queue shows up, self.pause()

        # NOTE(review): literal 1 here, but DOWNLOAD_OVER on the early-return
        # path above; confirm they are the same value.
        download_media_json['download_status'] = 1
        if ConfigInit().get_config_by_option('save_db'):
            scheduler_db_save_queue(download_media_json)
Exemple #26
0
def get_hash_sign(file_name):
    """Return ``hash_md5`` of the configured ``download_path`` followed by ``file_name``.

    :param file_name: file name appended to the configured download path.
    :return: the value produced by ``hash_md5``.
    """
    download_dir = ConfigInit().get_config_by_option('download_path')
    full_name = download_dir + file_name
    return hash_md5(full_name)
Exemple #27
0
def select_by_hash_sign(hash_sign):
    # type: (str) -> long
    """Look up the id of the fully downloaded row with the given ``hash_sign``.

    Only rows with ``download_status=1`` are considered.

    :param hash_sign: hash identifying the media file.
    :return: whatever ``client.getOne`` returns for the query.
        NOTE(review): the type comment says ``long``; confirm against the
        client whether a row object is returned instead.
    """
    query = 'select id from download_media where hash_sign="%s" and download_status=1' % hash_sign
    return ConfigInit().get_conn().getOne(query)