def update(download_media):
    # type: (DownloadMedia) -> long
    condition_dict = {"id": download_media.id}
    sql = build_update_sql('download_media', download_media.from_obj_to_json(),
                           condition_dict)
    client = ConfigInit().get_conn()
    logging.info('update download_media: %d' % download_media.id)
    return client.update(sql)
def select_to_merge():
    sql = """
        select * from crawler_online.download_media
        where merged_order not in (0, -1)
          and download_status = 1
          and merged_status is null
          and download_path != '/data/dev_ant/';
    """
    client = ConfigInit().get_conn()
    return client.getAll(sql)
def insert(download_media):
    # type: (DownloadMedia) -> long
    sql = build_insert_sql('download_media', download_media.from_obj_to_json())
    client = ConfigInit().get_conn()
    download_media_id = client.insertOne(sql)
    logging.info("insert new download_media %d, %s" %
                 (download_media_id, download_media.download_url))
    return download_media_id
def select_not_download_over_file():
    sql = """
        SELECT absolute_path FROM crawler_online.download_media
        where download_status != 1;
    """
    client = ConfigInit().get_conn()
    all_tuple = client.getAll(sql)
    return [url_dict['absolute_path'] for url_dict in all_tuple] if all_tuple else []
def select_tmp():
    sql = """
        select id, original_url, absolute_path, download_url, media_type,
               language, file_type
        from crawler_online.download_media
    """
    client = ConfigInit().get_conn()
    all_tuple = client.getAll(sql)
    return all_tuple
def select_original_url_downloaded_merged_media(urls_list):
    sql = """
        select original_url from crawler_online.download_media
        where media_type = 'merged' and file_type = 'mp4'
          and download_status = 1 and original_url in (%s);
    """ % ', '.join(map(lambda x: "'%s'" % x, urls_list))
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
def select_to_merge():
    sql = """
        select * from download_media
        where merged_sign in (
            select merged_sign from crawler_online.download_media
            where merged_status != 1 and merged_sign != '' and media_type != '%s'
            group by merged_sign having count(merged_sign) > 1)
        order by merged_sign, merged_order;
    """ % consts.constant_manager.MERGED
    client = ConfigInit().get_conn()
    return client.getAll(sql)
def select_original_url_downloaded_video_audio(urls_list):
    sql = """
        select original_url from crawler_online.download_media
        where (media_type = 'video' or media_type = 'audio')
          and download_status = 1 and original_url in (%s)
        group by original_url having count(original_url) > 1;
    """ % ', '.join(map(lambda x: "'%s'" % x, urls_list))
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
def select_original_url_downloaded_subtitle(urls_list):
    sql = """
        select distinct original_url from download_media
        where media_type = 'subtitle' and download_status = 1
          and original_url in (%s);
    """ % ', '.join(map(lambda x: "'%s'" % x, urls_list))
    client = ConfigInit().get_conn()
    urls_tuple = client.getAll(sql)
    return [url_dict['original_url'] for url_dict in urls_tuple] if urls_tuple else []
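# Note on the three selectors above: splicing URLs into the IN (...) list via
# string formatting breaks on embedded quotes and is injection-prone. Below is
# a parameterized sketch, assuming the underlying client can forward params to
# cursor.execute() the way pymysql does (the real ConfigInit client API may
# differ); it is an illustration, not the project's actual helper.
def select_downloaded_subtitle_parameterized(urls_list):
    placeholders = ', '.join(['%s'] * len(urls_list))
    sql = ('select distinct original_url from download_media '
           "where media_type='subtitle' and download_status=1 "
           'and original_url in (%s);' % placeholders)
    client = ConfigInit().get_conn()
    # hypothetical: assumes getAll accepts a params argument
    return client.getAll(sql, urls_list)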
def save(download_media):
    # type: (DownloadMedia) -> long
    logging.debug('save download_media: %s', download_media.download_url)
    sql_client = ConfigInit().get_conn()
    sql_check = ('select id from download_media where hash_sign="%s"'
                 % download_media.hash_sign)
    result = sql_client.getOne(sql_check)
    if result:
        download_media.id = result['id']
        update(download_media)
    else:
        download_media.id = insert(download_media)
    return download_media.id
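# Minimal usage sketch of the upsert in save() above. The DownloadMedia
# construction is hypothetical; the real model's fields may differ.
def demo_save():
    media = DownloadMedia()                       # hypothetical constructor
    media.download_url = 'http://example.com/a.mp4'
    media.hash_sign = get_hash_sign('a.mp4')
    first_id = save(media)    # no row with this hash_sign yet -> insert()
    second_id = save(media)   # row now exists -> update(), same id returned
    assert first_id == second_id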
def scheduler_remote_service(urls):
    remote_ip = ConfigInit().get_config_by_option('remote_ip')
    urls_to_remote = [
        'http://%s:1080/to_controller?url=%s' % (remote_ip, url) for url in urls
    ]
    for url_to_remote in urls_to_remote:
        response = requests.get(url_to_remote)
        logging.debug(response.text)
def process(self, content):
    response_stream = from_string_to_json(get_and_download_stream_obj(content))
    if response_stream['type'] == consts.constant_manager.DOWNLOAD:
        for download_info in response_stream['download_file_list']:
            file_name = get_file_name_by_download_url(download_info['download_url'])
            if download_info['media_type'] == consts.constant_manager.SUBTITLE:
                file_name = (response_stream['site'] + '_' +
                             get_file_name_by_download_url(response_stream['original_url']) +
                             '_' + download_info['language'])
            file_obj = DownloadFile(download_url=download_info['download_url'],
                                    file_name=file_name,
                                    site=response_stream['site'],
                                    original_url=response_stream['original_url'])
            download_media_json = {
                'video_url': response_stream['video_url'],
                'original_url': response_stream['original_url'],
                'download_url': download_info['download_url'],
                'media_quality': download_info['media_quality'],
                'episode': response_stream['episode'],
                'download_path': ConfigInit().get_config_by_option('download_path'),
                'media_name': response_stream['media_name'],
                'hash_sign': get_hash_sign(file_name),
                'media_type': download_info['media_type'],
                'site': response_stream['site'],
                'language': download_info['language'],
                'merged_sign': download_info['merged_sign'],
                'merged_order': download_info['merged_order'],
            }
            scheduler_db_save_queue(download_media_json)
            # todo: finer-grained management of download priority
            if int(download_info['priority']) > 50:
                scheduler_download_queue(file_obj.from_obj_to_json(), priority=True)
            else:
                scheduler_download_queue(file_obj.from_obj_to_json())
    return response_stream
def get_driver(self, name='chrome', type='headless'):
    # todo: fix memory leaks; centralize per-browser configuration
    deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
    if name == 'phantomjs':
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random.choice(consts.USER_AGENTS)
        dcap["phantomjs.page.settings.loadImages"] = False
        driver_phantomjs = webdriver.PhantomJS(
            desired_capabilities=dcap,
            executable_path=deploy_home + '/src/config/phantomjs')
        self._driver[name] = driver_phantomjs
    elif name == 'chrome':
        opts = ChromeOptions()
        opts.add_argument('--no-sandbox')
        opts.add_argument('--disable-dev-shm-usage')
        # opts.add_argument('--proxy-server=http://97.64.40.68:10086')
        dcap = dict(DesiredCapabilities.CHROME)
        dcap["chrome.page.settings.loadImages"] = False
        # PROXY = '97.64.40.68:10086'
        # dcap['proxy'] = {
        #     "httpProxy": PROXY,
        #     "ftpProxy": PROXY,
        #     "sslProxy": PROXY,
        #     "noProxy": None,
        #     "proxyType": "MANUAL",
        #     "class": "org.openqa.selenium.Proxy",
        #     "autodetect": False
        # }
        if type == 'headless':
            opts.add_argument("--headless")
        chrome_driver = webdriver.Chrome(
            desired_capabilities=dcap,
            executable_path=deploy_home + ConfigInit().get_config_by_option('chrome_path'),
            chrome_options=opts)
        self._driver[name] = chrome_driver
    elif name == 'firefox':
        opts = FirefoxOptions()
        if type == 'headless':
            opts.add_argument("--headless")
        firefox_driver = webdriver.Firefox(
            executable_path=deploy_home + '/src/config/geckodriver_mac',
            firefox_options=opts)
        self._driver[name] = firefox_driver
    return self._driver[name]
def demo_del_file():
    # delete locally downloaded mainland-China dramas
    urls = [url.replace('\n', '') for url in open('/data/my_ant/play_urls1')]
    sql = """
        SELECT absolute_path FROM crawler_online.download_media
        where download_status = 1 and download_path = '/data/dev_ant/'
          and original_url in (%s);
    """ % ', '.join(map(lambda x: "'%s'" % x, urls))
    client = ConfigInit().get_conn()
    all_tuple = client.getAll(sql)
    dalu_files_local = [url_dict['absolute_path'] for url_dict in all_tuple] if all_tuple else []
    for file in dalu_files_local:
        try:
            if os.path.exists(file):
                del_file(file)
        except:
            traceback.print_exc()
def get_driver(self, name='chrome', type='headless'):
    # todo: fix memory leaks; centralize per-browser configuration
    # Guard the cache check with the lock and release it even on the
    # early-return path (the original leaked the lock when a driver was cached).
    self._instance_lock.acquire()
    try:
        if name in self._driver:
            return self._driver[name]
    finally:
        self._instance_lock.release()
    deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
    if name == 'phantomjs':
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random.choice(consts.USER_AGENTS)
        dcap["phantomjs.page.settings.loadImages"] = False
        driver_phantomjs = webdriver.PhantomJS(
            desired_capabilities=dcap,
            executable_path=deploy_home + '/src/config/phantomjs')
        self._driver[name] = driver_phantomjs
        return driver_phantomjs
    elif name == 'chrome':
        opts = ChromeOptions()
        opts.add_argument('--no-sandbox')
        opts.add_argument('--disable-dev-shm-usage')
        dcap = dict(DesiredCapabilities.CHROME)
        dcap["chrome.page.settings.loadImages"] = False
        if type == 'headless':
            opts.add_argument("--headless")
        chrome_driver = webdriver.Chrome(
            desired_capabilities=dcap,
            executable_path=deploy_home + ConfigInit().get_config_by_option('chrome_path'),
            chrome_options=opts)
        self._driver[name] = chrome_driver
        return chrome_driver
    elif name == 'firefox':
        opts = FirefoxOptions()
        if type == 'headless':
            opts.add_argument("--headless")
        firefox_driver = webdriver.Firefox(
            executable_path=deploy_home + '/src/config/geckodriver_mac',
            firefox_options=opts)
        self._driver[name] = firefox_driver
        return firefox_driver
def merge_media(self, to_merged_medias_lists):
    # to_merged_medias_lists: [download_media_obj1, download_media_obj2, ...]
    inputs = {}
    merged_absolute_path = ''.join([
        ConfigInit().get_download_path(),
        hash_md5(to_merged_medias_lists[0]['merged_sign']), '.mp4'
    ])
    outputs = {merged_absolute_path: '-c copy'}
    # todo: force-overwrite handling for the ffmpeg merge
    if exist_file(merged_absolute_path):
        del_file(merged_absolute_path)
    for to_merged_media_dict in to_merged_medias_lists:
        inputs[to_merged_media_dict['absolute_path']] = ''
    ff = FFmpeg(inputs=inputs, outputs=outputs)
    ff.run()
    return merged_absolute_path
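# merge_media() above leans on ffmpy to build an "ffmpeg -i a -i b -c copy out"
# command that remuxes the separate streams into one mp4. A small sketch to
# preview the generated command without running it (ffmpy exposes the assembled
# command line via FFmpeg.cmd); the paths here are placeholders:
def demo_merge_command():
    ff = FFmpeg(inputs={'/tmp/video.mp4': '', '/tmp/audio.m4a': ''},
                outputs={'/tmp/merged.mp4': '-c copy'})
    print(ff.cmd)  # ffmpeg -i /tmp/video.mp4 -i /tmp/audio.m4a -c copy /tmp/merged.mp4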
def pre_parse_download_obj(self, download_file_obj):
    # type: (DownloadFile) -> bool
    """
    Probe the download URL and fill in file_type, total_size, file_name,
    hash_sign, download_path and absolute_path on the download object.
    :param download_file_obj:
    :return: True if the object is ready to download, False otherwise
    """
    try:
        headers = {
            'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
        }
        response = requests.get(download_file_obj.download_url,
                                stream=True,
                                headers=headers)
        if response.status_code == 400:
            logging.error('invalid timestamp %s' % download_file_obj.download_url)
            return False
        headers_json = dict(response.headers)
        if 'mp4' in headers_json['Content-Type']:
            download_file_obj.file_type = 'mp4'
        elif 'text' in headers_json['Content-Type']:
            download_file_obj.file_type = 'txt'
        else:
            logging.error('unknown file_type in %s' % download_file_obj.download_url)
            return False
        download_file_obj.total_size = int(headers_json['Content-Length'])
    except:
        traceback.print_exc()
        logging.error('pre_parse_download_obj error download_url %s' %
                      download_file_obj.download_url)
        # bail out here; headers_json may not exist if the request itself failed
        return False
    if download_file_obj.file_name == '':
        download_file_obj.file_name = get_file_name_by_download_url(
            download_file_obj.download_url)
    download_file_obj.hash_sign = get_hash_sign(
        file_name=download_file_obj.file_name)
    if download_file_obj.download_path == '':
        download_file_obj.download_path = ConfigInit().get_download_path()
    # todo: resuming gzip-compressed downloads is not supported yet
    if headers_json.get('Content-Encoding') == 'gzip':
        download_file_obj.download_type = consts.constant_manager.RE_DOWNLOAD
    download_file_obj.absolute_path = (download_file_obj.download_path +
                                       download_file_obj.file_name + '.' +
                                       download_file_obj.file_type)
    return True
def demo_browser():
    deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
    opts = ChromeOptions()
    # opts.binary_location = '/usr/bin/google-chrome'
    opts.add_argument("--headless")
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    dcap = dict(DesiredCapabilities.CHROME)
    dcap["chrome.page.settings.loadImages"] = False
    chrome_driver = webdriver.Chrome(
        desired_capabilities=dcap,
        executable_path=deploy_home + '/src/config/chromedriver_mac243',
        chrome_options=opts)
    chrome_driver.set_page_load_timeout(3)
    try:
        chrome_driver.get('https://www.viki.com/videos/170494v-dream-high-2-episode-5')
    except TimeoutException:
        traceback.print_exc()
    print(chrome_driver.page_source)
def pre_parse_download_obj(self, download_file_obj):
    # type: (DownloadFile) -> None
    """
    Probe the download URL and fill in file_type, total_size, file_name,
    hash_sign, download_path and absolute_path on the download object.
    :param download_file_obj:
    :return:
    """
    headers = {
        'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
    }
    response = requests.get(download_file_obj.download_url,
                            stream=True,
                            headers=headers)
    headers_json = dict(response.headers)
    # use .get so a missing Content-Type header does not raise KeyError
    if 'mp4' in headers_json.get('Content-Type', ''):
        download_file_obj.file_type = 'mp4'
    elif 'text' in headers_json.get('Content-Type', ''):
        download_file_obj.file_type = 'txt'
    else:
        logging.error('unknown file_type in %s' % download_file_obj.download_url)
    try:
        download_file_obj.total_size = int(headers_json['Content-Length'])
    except:
        logging.error('can not get total_size from download_url %s' %
                      download_file_obj.download_url)
    if download_file_obj.file_name == '':
        download_file_obj.file_name = hash_md5(download_file_obj.download_url)
    download_file_obj.hash_sign = get_hash_sign(
        file_name=download_file_obj.file_name)
    if download_file_obj.download_path == '':
        download_file_obj.download_path = ConfigInit().get_download_path()
    # todo: resuming gzip-compressed downloads is not supported yet
    if headers_json.get('Content-Encoding') == 'gzip':
        download_file_obj.download_type = 'wb+'
    download_file_obj.absolute_path = (download_file_obj.download_path +
                                       download_file_obj.file_name + '.' +
                                       download_file_obj.file_type)
def __init__(self,
             host=ConfigInit().get_config_by_option('redis_ip'),
             port=6379):
    # note: the host default is evaluated once, at function-definition time,
    # so later config changes will not be picked up by this default
    self.__db = None
    self.init(host, port)
from flask import Flask, request, jsonify

from src.tools import logger
from src.tools.config_manager import ConfigInit
from app.scheduler import scheduler_controller_queue

app = Flask(__name__)


# todo: handle high-concurrency access to this endpoint
@app.route('/to_controller', methods=['GET'])
def to_controller():
    result_dict = {
        "info": "to_controller",
        "state": "success",
        "url": "",
    }
    if not request.args or 'url' not in request.args:
        result_dict['info'] = 'no url or url is wrong, ip is %s' % request.remote_addr
        result_dict['state'] = 'false'
        # return early: without a url parameter there is nothing to schedule
        return jsonify(result_dict)
    scheduler_controller_queue(request.args['url'])
    result_dict['url'] = request.args['url']
    return jsonify(result_dict)


if __name__ == "__main__":
    logger.init_log()
    # with host set to 0.0.0.0 the service is reachable from outside the host
    app.run(host=ConfigInit().get_config_by_option('service_ip'),
            port=8080,
            debug=True)
def download_media(self, download_file_obj):
    # type: (DownloadFile) -> None
    """
    Definition of "the same file": same path, same name and same size.
    Checks whether the same link has already been downloaded and stored locally.
    Only handles downloading and resuming; no other logic lives here.
    :param download_file_obj: name, file_type and file_path may all be defaulted; original_url
    :return:
    """
    if not self.pre_parse_download_obj(download_file_obj):
        logging.error('no need to download url %s' % download_file_obj.download_url)
        return
    download_media_json = {
        'hash_sign': download_file_obj.hash_sign,
        'total_size': download_file_obj.total_size,
        'download_url': download_file_obj.download_url,
        'absolute_path': download_file_obj.absolute_path,
        'file_type': download_file_obj.file_type,
        'download_status': consts.constant_manager.NOT_DOWNLOAD_OVER,
    }
    if self.download_over(download_file_obj):
        download_media_json['download_status'] = consts.constant_manager.DOWNLOAD_OVER
        if ConfigInit().get_config_by_option('save_db'):
            logging.debug('save media download_url %s, hash_sign %s' %
                          (download_file_obj.download_url, download_file_obj.hash_sign))
            scheduler_db_save_queue(download_media_json)
        return
    elif os.path.exists(download_file_obj.absolute_path):
        temp_size = os.path.getsize(download_file_obj.absolute_path)
    else:
        temp_size = 0
    headers = {
        'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
    }
    headers.update({'Range': 'bytes=%d-' % temp_size})
    with closing(requests.get(download_file_obj.download_url,
                              stream=True,
                              headers=headers)) as response:
        with open(download_file_obj.absolute_path, download_file_obj.download_type) as file:
            chunk_size = 1024
            progress = ProgressBar(download_file_obj.hash_sign,
                                   download_file_obj.absolute_path,
                                   total=download_file_obj.total_size,
                                   now_size=float(temp_size),
                                   last_size=float(temp_size),
                                   unit="KB",
                                   chunk_size=chunk_size,
                                   status=consts.constant_manager.DOWNLOADING)
            scheduler_db_save_queue(download_media_json)
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                # if pausing is required: self.pause()
                progress.refresh(count=len(data))
            progress.refresh(status=consts.constant_manager.DOWNLOAD_OVER)
    # the queue item has finished downloading
    if self.download_over(download_file_obj):
        download_media_json['download_status'] = consts.constant_manager.DOWNLOAD_OVER
        if ConfigInit().get_config_by_option('save_db'):
            logging.debug('download over download_url %s, hash_sign %s' %
                          (download_file_obj.download_url, download_file_obj.hash_sign))
            scheduler_db_save_queue(download_media_json)
    else:
        logging.debug('not download over download_url %s, hash_sign %s' %
                      (download_file_obj.download_url, download_file_obj.hash_sign))
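# Usage sketch for the resumable downloader above: a DownloadFile is built the
# same way process() does it, then handed to download_media(). The Downloader
# class name is an assumption for illustration; the real owner class may differ.
def demo_download_media():
    file_obj = DownloadFile(download_url='http://example.com/clip.mp4',
                            file_name='clip',
                            site='example',
                            original_url='http://example.com/watch?v=1')
    Downloader().download_media(file_obj)  # hypothetical owner class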
sys.path.append('../')
from src.tools.config_manager import ConfigInit
from app.scheduler import scheduler_controller_queue
from flask import Flask, request, jsonify
from src.tools import logger

app = Flask(__name__)


# todo: handle high-concurrency access to this endpoint
@app.route('/to_controller', methods=['GET'])
def to_controller():
    result_dict = {
        "info": "to_controller",
        "state": "success",
        "url": "",
    }
    if not request.args or 'url' not in request.args:
        result_dict['info'] = 'no url or url is wrong, ip is %s' % request.remote_addr
        result_dict['state'] = 'false'
        # return early: without a url parameter there is nothing to schedule
        return jsonify(result_dict)
    scheduler_controller_queue(request.args['url'])
    result_dict['url'] = request.args['url']
    return jsonify(result_dict)


if __name__ == "__main__":
    logger.init_log()
    # with host set to 0.0.0.0 the service is reachable from outside the host
    app.run(host=ConfigInit().get_config_by_option('service_ip'),
            port=1080,
            debug=True)
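# Example of driving the service above, mirroring the URL scheme that
# scheduler_remote_service() uses: GET /to_controller?url=<page url>.
# 127.0.0.1 stands in for the configured service_ip.
import requests

resp = requests.get('http://127.0.0.1:1080/to_controller',
                    params={'url': 'http://example.com/watch?v=1'})
print(resp.json())  # {"info": "to_controller", "state": "success", "url": "..."}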
def demo_mysql():
    conn = ConfigInit().get_conn()
    count = conn.getOne('select count(*) from download_media')
    print(count)
def download_media(self, download_file_obj):
    # type: (DownloadFile) -> None
    """
    Definition of "the same file": same path, same name and same size.
    Checks whether the same link has already been downloaded and stored locally.
    Only handles downloading and resuming; no other logic lives here.
    todo: multi-threaded download of large files in one queue; auto-create the
    default download path; subtitle URLs differ per request (no resume for now)
    :param download_file_obj: name, file_type and file_path may all be defaulted; original_url
    :return:
    """
    self.pre_parse_download_obj(download_file_obj)
    download_media_json = {
        'hash_sign': download_file_obj.hash_sign,
        'total_size': download_file_obj.total_size,
        'download_url': download_file_obj.download_url,
        'absolute_path': download_file_obj.absolute_path,
        'file_type': download_file_obj.file_type,
        'download_status': consts.constant_manager.NOT_DOWNLOAD_OVER,
    }
    if exist_file(download_file_obj.absolute_path) and \
            get_file_size(download_file_obj.absolute_path) == download_file_obj.total_size:
        logging.debug('same file %s' % download_file_obj.absolute_path)
        download_media_json['download_status'] = consts.constant_manager.DOWNLOAD_OVER
        if ConfigInit().get_config_by_option('save_db'):
            scheduler_db_save_queue(download_media_json)
        return
    elif os.path.exists(download_file_obj.absolute_path):
        temp_size = os.path.getsize(download_file_obj.absolute_path)
    else:
        temp_size = 0
    headers = {
        'User-Agent': random.choice(consts.constant_manager.USER_AGENTS)
    }
    headers.update({'Range': 'bytes=%d-' % temp_size})
    with closing(requests.get(download_file_obj.download_url,
                              stream=True,
                              headers=headers)) as response:
        chunk_size = 1024
        progress = ProgressBar(download_file_obj.absolute_path,
                               total=download_file_obj.total_size,
                               unit="KB",
                               chunk_size=chunk_size,
                               run_status="downloading",
                               fin_status="download finished")
        with open(download_file_obj.absolute_path, download_file_obj.download_type) as file:
            # todo: generic downloader; revisit this logic
            download_file_status_json = copy.deepcopy(download_file_status)
            download_file_status_json['hash_sign'] = download_file_obj.hash_sign
            download_file_status_json['total_size'] = download_file_obj.total_size
            logging.debug('start download file %s' % download_file_obj.download_url)
            scheduler_download_status_queue(download_file_status_json)
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                progress.refresh(count=len(data))
                # todo: if a higher-priority download queue appears, self.pause()
    download_media_json['download_status'] = consts.constant_manager.DOWNLOAD_OVER
    if ConfigInit().get_config_by_option('save_db'):
        scheduler_db_save_queue(download_media_json)
def get_hash_sign(file_name):
    save_path = ConfigInit().get_config_by_option('download_path')
    return hash_md5(save_path + file_name)
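# hash_sign is an md5 over download_path + file_name, so the same file name
# under the same configured path always maps to one hash_sign. That is what
# lets save() treat hash_sign as the dedup key; a quick sketch:
def demo_hash_sign():
    a = get_hash_sign('episode_01.mp4')
    b = get_hash_sign('episode_01.mp4')
    assert a == b  # identical input -> identical sign, so save() upserts one row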
def select_by_hash_sign(hash_sign):
    # type: (str) -> dict
    sql = ('select id from download_media where hash_sign="%s" '
           'and download_status=1' % hash_sign)
    client = ConfigInit().get_conn()
    return client.getOne(sql)