Beispiel #1
0
 def __init__(self, project_path='/root/project/'):
     """Start the browser session and prepare an empty ``banner.json``.

     Args:
         project_path: Output directory. File names are appended to it by
             plain string concatenation, so it must end with a path
             separator.
     """
     # Initialize Chrome
     self.driver, self.server, self.proxy = ChromeDriver().get_driver()
     self.project_path = project_path
     # exist_ok replaces the exists()-then-create check, which could fail
     # if the directory appeared between the two calls.
     os.makedirs(project_path, exist_ok=True)
     # Create (or truncate) the output file; the context manager closes the
     # handle even if an error occurs.
     with open(project_path + 'banner.json', 'w+'):
         pass
Beispiel #2
0
class Banner:
    """Collect the flash banners of ifvod.tv through a proxied Chrome.

    The browser loads the page while the proxy records the traffic; the
    banner API request is then picked out of the recorded HAR and replayed
    with ``requests``.
    """

    # Seconds to wait for the replayed API request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, project_path='/root/project/'):
        """Start the browser session and prepare an empty ``banner.json``.

        Args:
            project_path: Output directory. File names are appended to it by
                plain string concatenation, so it must end with a path
                separator.
        """
        # Initialize Chrome
        self.driver, self.server, self.proxy = ChromeDriver().get_driver()
        self.project_path = project_path
        # exist_ok replaces the exists()-then-create check, which could fail
        # if the directory appeared between the two calls.
        os.makedirs(project_path, exist_ok=True)
        # Create (or truncate) the output file.
        with open(project_path + 'banner.json', 'w+'):
            pass

    @staticmethod
    def _captured_urls(har):
        """Yield the request URL of every entry in a HAR capture.

        Missing or empty ``log`` / ``entries`` / ``request`` / ``url`` parts
        are skipped instead of raising, so a capture that has not recorded
        anything yet simply yields nothing.
        """
        log = (har or {}).get('log') or {}
        for entry in log.get('entries') or []:
            url = (entry.get('request') or {}).get('url')
            if url:
                yield url

    def get_limitation(self):
        """Load the movies page and persist the banner data it requests.

        Polls the proxy's HAR capture for up to 60 seconds looking for a
        request whose URL contains ``/api/home/getflashbanner``, replays
        that request and stores the result via ``_save_banners``.

        Returns:
            True once the banner data has been saved (the browser is left
            running). False if no banner request was captured in time; in
            that case the browser and the proxy server are shut down.
        """
        url = 'https://www.ifvod.tv/movies'
        self.driver.get(url)
        self.proxy.new_har("datayes-1",
                           options={
                               'captureHeaders': True,
                               'captureContent': True
                           })
        time_start = time.time()
        while time.time() - time_start < 60:
            # Take a fresh snapshot on every pass; requests keep arriving
            # while the page loads.
            for _url in self._captured_urls(self.proxy.har):
                if "/api/home/getflashbanner" not in _url:
                    continue
                time_end = time.time()
                r = requests.get(_url, timeout=self.REQUEST_TIMEOUT)
                self._save_banners(r.json()['data']['info'])
                print(_url, time_end - time_start)
                return True

        self.driver.quit()
        self.server.stop()
        return False

    def _save_banners(self, info):
        """Download every banner image and write the list to ``banner.json``.

        Args:
            info: Iterable of banner dicts providing ``img``, ``url`` and
                ``title``.
        """
        image_dir = self.project_path + 'banner_image'
        # Drop images left over from an earlier run.
        if os.path.exists(image_dir):
            shutil.rmtree(image_dir)
        banners = []
        for item in info:
            image_file = str(uuid.uuid1()) + '.png'
            ImageSaver().save_image('https:' + item['img'],
                                    image_dir + '/',
                                    image_file)
            banners.append({
                # The id is the value following the first '=' in the link.
                'id': item['url'].split('=')[1],
                'title': item['title'],
                'image': image_file,
            })
        # ensure_ascii=False emits raw non-ASCII text, so fix the encoding
        # instead of relying on the platform default.
        with open(self.project_path + 'banner.json', 'w+',
                  encoding='utf-8') as f:
            f.write(json.dumps(banners, ensure_ascii=False))
 def __init__(self, project_path='/root/project/'):
     """Start the browser session and reset the output files.

     Creates an empty ``movie_detail.json`` and an empty ``images/``
     directory below ``project_path``.

     Args:
         project_path: Output directory. File names are appended to it by
             plain string concatenation, so it must end with a path
             separator.
     """
     # Initialize Chrome
     self.driver, self.server, self.proxy = ChromeDriver().get_driver()
     self.project_path = project_path
     # exist_ok replaces the exists()-then-create check, which could fail
     # if the directory appeared between the two calls.
     os.makedirs(project_path, exist_ok=True)
     # Create (or truncate) the output file; the context manager closes the
     # handle even if an error occurs.
     with open(project_path + 'movie_detail.json', 'w+'):
         pass
     # Start from an empty image directory on every run.
     file_dir = project_path + 'images/'
     if os.path.exists(file_dir):
         shutil.rmtree(file_dir)
     os.makedirs(file_dir)
Beispiel #4
0
 def start_crawl(self):
     """Crawl every listing page, then rewrite the collected result.

     A page whose fetch reports failure is retried without limit; Chrome
     and the proxy server are restarted before each new attempt.
     """
     # NOTE(review): assumes get_limitation() returns the total number of
     # pages and that pages are numbered from 1 - confirm.
     limitation = self.get_limitation()
     # range() excludes its stop value; +1 so the last page is crawled too.
     for i in range(1, limitation + 1):
         flag = self.get_movie_list(i)
         while not flag:
             print("quit chrome")
             self.driver.quit()
             self.server.stop()
             time.sleep(5)
             print("reopen chrome ")
             self.driver, self.server, self.proxy = ChromeDriver().get_driver()
             flag = self.get_movie_list(i)
     self.driver.quit()
     self.server.stop()
     time.sleep(5)
     self.rewrite_result()
 def start_crawl(self):
     """Fetch the detail of every loaded id, then save and upload the result.

     An id whose fetch reports failure is retried without limit; Chrome and
     the proxy server are restarted before each new attempt.
     """
     movie_ids = self.load_file()
     started_at = time.time()
     bar = '=' * 50
     for position, movie_id in enumerate(movie_ids, start=1):
         # Keep retrying the same id with a fresh browser until it succeeds.
         while not self.get_movie_detail(movie_id):
             print("quit chrome", bar)
             self.driver.quit()
             print("driver.quit", bar)
             self.server.stop()
             print("server.stop", bar)
             time.sleep(10)
             print("reopen chrome ", bar)
             restarted = ChromeDriver().get_driver()
             self.driver, self.server, self.proxy = restarted
         print(position, movie_id)
     self.save_as_json()
     self.driver.quit()
     self.server.stop()
     elapsed = time.time() - started_at
     print('程序运行时间:', elapsed, '=' * 40)
     self.send_file(self.project_path)
Beispiel #6
0
async def fetch_info(session, url):
    """Open *url* in a browser, locate its video source and download it.

    Args:
        session: Passed through unchanged to ``download_file``.
        url: Page address handed to ``ChromeDriver.process``.

    Raises:
        IndexError: If the page source contains no ``var flashvars =`` line.
    """
    driver = ChromeDriver()
    await driver.process(url)

    video_xpath = '//*[@id="player"]/div[21]/video/source'
    # Block until the <source> element exists before reading from it.
    show = EC.presence_of_element_located((By.XPATH, video_xpath))
    driver.wait.until(show)
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    # current Selenium releases no longer provide.
    # NOTE(review): assumes driver.driver is a Selenium WebDriver - confirm.
    video_url = driver.driver.find_element(By.XPATH, video_xpath).get_attribute('src')

    html = driver.driver.page_source
    info = re.findall('var flashvars =(.*?),\n', html)
    info_json = json.loads(info[0])

    # Metadata embedded in the page. It is extracted here but not consumed
    # anywhere else in this function.
    metadata = {
        key: info_json.get(key)
        for key in ('video_duration', 'video_title', 'image_url',
                    'link_url', 'quality_480p')
    }

    # The download target mirrors the URL path below ./tmp/.
    parse_result = urlparse(video_url)
    file_path = parse_result.path
    await download_file(session, video_url, "./tmp/" + file_path)
class MovieDetail:
    """Download detail records and poster images for a list of movie ids.

    Ids are read from ``url.json``. One JSON record per movie is appended to
    ``movie_detail.json``, which is finally rewritten as a single JSON array
    and uploaded together with the rest of the project directory.
    """

    # Seconds to wait for a replayed API request before it counts as failed.
    REQUEST_TIMEOUT = 30
    # How many times a replayed API request is attempted.
    REQUEST_ATTEMPTS = 3

    def __init__(self, project_path='/root/project/'):
        """Start the browser session and reset the output files.

        Creates an empty ``movie_detail.json`` and an empty ``images/``
        directory below ``project_path``.

        Args:
            project_path: Output directory. File names are appended to it by
                plain string concatenation, so it must end with a path
                separator.
        """
        # Initialize Chrome
        self.driver, self.server, self.proxy = ChromeDriver().get_driver()
        self.project_path = project_path
        # exist_ok replaces the exists()-then-create check, which could fail
        # if the directory appeared between the two calls.
        os.makedirs(project_path, exist_ok=True)
        with open(project_path + 'movie_detail.json', 'w+'):
            pass
        # Start from an empty image directory on every run.
        file_dir = project_path + 'images/'
        if os.path.exists(file_dir):
            shutil.rmtree(file_dir)
        os.makedirs(file_dir)

    def start_crawl(self):
        """Fetch the detail of every loaded id, then save and upload.

        An id whose fetch reports failure is retried without limit; Chrome
        and the proxy server are restarted before each new attempt.
        """
        url_list = self.load_file()
        time_start = time.time()
        for count, i in enumerate(url_list, start=1):
            flag = self.get_movie_detail(i)
            while not flag:
                print("quit chrome", '=' * 50)
                self.driver.quit()
                print("driver.quit", '=' * 50)
                self.server.stop()
                print("server.stop", '=' * 50)
                time.sleep(10)
                print("reopen chrome ", '=' * 50)
                self.driver, self.server, self.proxy = ChromeDriver(
                ).get_driver()
                flag = self.get_movie_detail(i)
            print(count, i)
        self.save_as_json()
        self.driver.quit()
        self.server.stop()
        print('程序运行时间:', time.time() - time_start, '=' * 40)
        self.send_file(self.project_path)

    def load_file(self):
        """Return the ids stored as one JSON array in ``url.json``."""
        with open(self.project_path + "url.json", "r",
                  encoding='utf-8') as file:
            return list(json.loads(file.read()))

    @staticmethod
    def _captured_urls(har):
        """Yield the request URL of every entry in a HAR capture.

        Missing or empty ``log`` / ``entries`` / ``request`` / ``url`` parts
        are skipped instead of raising, so a capture that has not recorded
        anything yet simply yields nothing.
        """
        log = (har or {}).get('log') or {}
        for entry in log.get('entries') or []:
            url = (entry.get('request') or {}).get('url')
            if url:
                yield url

    def _fetch_with_retry(self, url):
        """GET *url*, retrying on request errors.

        Returns:
            The response, or None when every attempt raised a
            ``requests.RequestException``.
        """
        for attempt in range(1, self.REQUEST_ATTEMPTS + 1):
            try:
                return requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # requests signals failure by raising, never by returning
                # None, so the retry has to be driven by the exception.
                print('request failed', attempt, url, exc)
        return None

    def get_movie_detail(self, url, file_name='movie_detail.json'):
        """Open one movie page and append its detail record to a file.

        Polls the proxy's HAR capture for up to 30 seconds. Every captured
        request whose URL contains ``api/video/detail`` is replayed and its
        parsed record appended as one JSON line.

        Args:
            url: Movie id appended to the detail page address.
            file_name: Output file name below ``project_path``.

        Returns:
            True if at least one record was written, or if the replayed
            request failed on every attempt (the movie is then skipped so
            the caller does not retry it). False if no detail request was
            captured in time.
        """
        self.driver.get('https://www.ifvod.tv/detaili?id=' + url)
        self.proxy.new_har('datayes',
                           options={
                               'captureHeaders': True,
                               'captureContent': True
                           })
        time_start = time.time()
        flag = False
        while not flag and time.time() - time_start < 30:
            # Take a fresh snapshot on every pass; requests keep arriving
            # while the page loads.
            for _url in self._captured_urls(self.proxy.har):
                if "api/video/detail" not in _url:
                    continue
                r = self._fetch_with_retry(_url)
                if r is None:
                    return True
                flag = True
                res = self.get_movie_info(r.json())
                with open(self.project_path + file_name, 'a+',
                          encoding='utf-8') as file:
                    file.write(json.dumps(res, ensure_ascii=False) + '\n')
                    print(_url)
                time_end = time.time()
                print(time_end - time_start)
        return flag

    def get_movie_info(self, json_dic):
        """Flatten an API detail response and download the poster image.

        Args:
            json_dic: Decoded API response; the record is taken from
                ``json_dic['data']['info'][0]``.

        Returns:
            Dict with the selected fields plus the generated image name.
        """
        info = json_dic['data']['info'][0]
        image_name = str(uuid.uuid1())
        result = {
            "duonaoId": info['key'],
            "language": info['vl']['lang'],
            "publishYear": info['post_Year'],
            "brief": info['contxt'],
            "review": info['commentNumber'],
            "addDate": info['add_date'],
            "unlike": info['vl']['dc'],
            "region": info['vl']['regional'],
            "hotRank": info['vl']['hot'],
            "actor": info['vl']['starring'],
            "channel": info['channel'],
            "name": info['vl']['title'],
            "director": info['vl']['director'],
            "interest": info['vl']['dd'],
            "category": info['videoType'],
            "rate": info['pinfenRate'],
            # NOTE(review): stored without the '.jpeg' suffix the file is
            # saved under - confirm consumers expect the bare name.
            "image": image_name,
        }
        ImageSaver().save_image('https:' + info['imgPath'],
                                self.project_path + 'images/',
                                image_name + '.jpeg')
        return result

    def save_as_json(self):
        """Rewrite ``movie_detail.json`` from JSON lines to one JSON array.

        Blank lines are ignored.
        """
        path = self.project_path + 'movie_detail.json'
        with open(path, 'r', encoding='utf-8') as file:
            records = [json.loads(line) for line in file if line.strip()]
        with open(path, 'w+', encoding='utf-8') as f:
            f.write(json.dumps(records, ensure_ascii=False))

    def send_file(self, file_name):
        """Upload every file below *file_name* to ``/home/ubuntu`` via SFTP.

        The directory layout below *file_name* is mirrored on the remote
        side; remote directories are not created here.

        Args:
            file_name: Local directory; trailing slashes are ignored.
        """
        # NOTE(review): host and credentials are hard-coded and unknown host
        # keys are accepted automatically - consider moving these to
        # configuration and verifying the host key.
        client = paramiko.SSHClient()  # Obtain an SSHClient instance
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(hostname="122.51.155.8",
                           username="******",
                           password="******",
                           port=22)  # Connect to the SSH server
            transport = client.get_transport()  # Obtain the Transport

            # Create the SFTP object that performs the file transfer.
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                # Strip trailing slashes only; cutting the last character
                # unconditionally corrupts a path given without one.
                base = file_name.rstrip('/')
                for root, _dirs, files in os.walk(base):
                    # Every walked root starts with base, so slicing gives
                    # the relative part without touching later occurrences
                    # of the same text.
                    path = '/home/ubuntu' + root[len(base):] + '/'
                    print('当前目录路径', path)  # Current directory path
                    # All non-directory files below the current path
                    print('当前路径下所有非目录子文件', files)
                    for i in files:
                        print(path + i)
                        print(root + '/' + i)
                        sftp.put(root + '/' + i, path + i)
            finally:
                sftp.close()
        finally:
            # Close the connection
            client.close()
Beispiel #8
0
class MovieList:
    """Collect the ids of all movies in the ifvod.tv listing.

    Each listing page is loaded in a proxied Chrome; the search API request
    is picked out of the recorded HAR and replayed with ``requests``. The
    ids are appended to ``url.json`` page by page and finally rewritten as
    one JSON array.
    """

    # Listing page address; {0} is the 1-based page number.
    LIST_URL = ('https://www.ifvod.tv/list?keyword=&star=&page={0}&pageSize=30'
                '&cid=0,1,3&year=-1&language=-1&region=-1'
                '&status=-1&orderBy=2&desc=true')
    # Results per page; must match the pageSize value in LIST_URL.
    PAGE_SIZE = 30
    # Seconds to wait for a replayed API request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, project_path='/root/project/'):
        """Start the browser session and prepare an empty ``url.json``.

        Args:
            project_path: Output directory. File names are appended to it by
                plain string concatenation, so it must end with a path
                separator.
        """
        # Initialize Chrome
        self.driver, self.server, self.proxy = ChromeDriver().get_driver()
        self.project_path = project_path
        # exist_ok replaces the exists()-then-create check, which could fail
        # if the directory appeared between the two calls.
        os.makedirs(project_path, exist_ok=True)
        with open(project_path + 'url.json', 'w+'):
            pass

    def start_crawl(self):
        """Crawl every listing page, then consolidate ``url.json``.

        A page whose fetch reports failure is retried without limit; Chrome
        and the proxy server are restarted before each new attempt.

        Raises:
            TypeError: If ``get_limitation`` timed out and returned None.
        """
        limitation = self.get_limitation()
        # get_limitation() returns the page count and pages start at 1, so
        # the stop value needs +1 for the last page to be crawled too.
        for i in range(1, limitation + 1):
            flag = self.get_movie_list(i)
            while not flag:
                print("quit chrome")
                self.driver.quit()
                self.server.stop()
                time.sleep(5)
                print("reopen chrome ")
                self.driver, self.server, self.proxy = ChromeDriver().get_driver()
                flag = self.get_movie_list(i)
        self.driver.quit()
        self.server.stop()
        time.sleep(5)
        self.rewrite_result()

    def rewrite_result(self):
        """Replace the per-page lines in ``url.json`` by one JSON array."""
        final_list = self.load_file()
        with open(self.project_path + "url.json", "w+",
                  encoding='utf-8') as file:
            file.write(json.dumps(final_list, ensure_ascii=False))

    def load_file(self):
        """Return all ids from ``url.json``.

        Every non-blank line is parsed as a JSON array and the arrays are
        concatenated in file order.
        """
        url_list = []
        with open(self.project_path + "url.json", "r",
                  encoding='utf-8') as file:
            for line in file:
                if line.strip():
                    url_list.extend(json.loads(line))
        return url_list

    @staticmethod
    def _captured_urls(har):
        """Yield the request URL of every entry in a HAR capture.

        Missing or empty ``log`` / ``entries`` / ``request`` / ``url`` parts
        are skipped instead of raising, so a capture that has not recorded
        anything yet simply yields nothing.
        """
        log = (har or {}).get('log') or {}
        for entry in log.get('entries') or []:
            url = (entry.get('request') or {}).get('url')
            if url:
                yield url

    def get_movie_list(self, page):
        """Load one listing page and append its movie ids to ``url.json``.

        Polls the proxy's HAR capture for up to 10 seconds. Every captured
        request whose URL contains ``api/list/Search`` is replayed and the
        ``key`` of each returned result is written as one JSON array line.

        Args:
            page: 1-based page number.

        Returns:
            True if at least one search request was captured and stored,
            False otherwise.
        """
        self.driver.get(self.LIST_URL.format(page))
        self.proxy.new_har("datayes", options={'captureHeaders': True, 'captureContent': True})
        time_start = time.time()
        flag = False
        while not flag and time.time() - time_start < 10:
            # Take a fresh snapshot on every pass; requests keep arriving
            # while the page loads.
            for _url in self._captured_urls(self.proxy.har):
                if "api/list/Search" not in _url:
                    continue
                r = requests.get(_url, timeout=self.REQUEST_TIMEOUT)
                flag = True
                time_end = time.time()
                keys = [item['key']
                        for item in r.json()['data']['info'][0]['result']]
                with open(self.project_path + 'url.json', 'a+',
                          encoding='utf-8') as file:
                    file.write(json.dumps(keys, ensure_ascii=False) + '\n')
                print(_url, time_end - time_start)
        return flag

    def get_limitation(self):
        """Return the number of listing pages.

        Loads the first listing page, waits up to 60 seconds for a captured
        request whose URL contains ``api/list/Search``, replays it and
        divides the reported ``recordcount`` by the page size, rounding up.

        Returns:
            The page count, or None if no search request was captured in
            time.
        """
        self.driver.get(self.LIST_URL.format(1))
        self.proxy.new_har("datayes-1", options={'captureHeaders': True, 'captureContent': True})
        time_start = time.time()
        while time.time() - time_start < 60:
            for _url in self._captured_urls(self.proxy.har):
                if "api/list/Search" not in _url:
                    continue
                r = requests.get(_url, timeout=self.REQUEST_TIMEOUT)
                print(_url)
                record_count = int(r.json()['data']['info'][0]['recordcount'])
                return math.ceil(record_count / self.PAGE_SIZE)
        return None