Example 1
    def get_song_detail(self, id):
        """
        get song detail from playlist
        """

        host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
        json = proxy_req(host, 1)
        if json == 0:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        result = json['result']
        tracks = result['tracks']

        if len(tracks) <= 1:
            if can_retry(host):
                self.get_song_detail(id)
            return []
        else:
            playcount = result['playCount']
            for track in tracks:
                songid = track['id']
                songname = track['name']
                self.songlist.append([songid, songname, playcount])
            self.finishlist.append(id)
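Every example on this page follows the same retry idiom: issue the request through proxy_req or basic_req, validate the response, and re-enter the same function while can_retry(url) still grants attempts. The helper itself never appears on this page; the sketch below is only a guess at a minimal implementation, assuming a module-level failure counter keyed by URL and a default budget of three attempts (the can_retry(url, 6) call in Example 10 suggests the budget is a parameter).

# A guess at a minimal can_retry, NOT the original implementation:
# count consecutive failures per URL and grant up to `times` attempts.
failure_map = {}

def can_retry(url: str, times: int = 3) -> bool:
    ''' return True while `url` still has retry budget left '''
    failure_map[url] = failure_map.get(url, 0) + 1
    if failure_map[url] < times:
        return True
    del failure_map[url]  # reset the counter so a later run can try again
    return False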
Example 2
 def summarization_once(self, index):
     """
     get html from news
     """
     print(index)
     texts = []
     if index:
         url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&pn=730&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \
             str(index * 20)
     else:
         url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=1&bs=%E6%AF%92%E7%8B%97%E8%82%89&rsv_bp=1&sr=0&f=8&prevct=no&tn=news&word=%E5%81%B7%E7%8B%97'
     news_lists = proxy_req(url, 0)
     if not news_lists:
         if can_retry(url):
             self.summarization_once(index)
         return
     summarization_lists = news_lists.find_all('div', class_='result')
     if not summarization_lists:
         if can_retry(url):
             self.summarization_once(index)
         return
     print('num: ', len(summarization_lists), url)
     for summarization in summarization_lists:
         temp_text = summarization.text.replace('\n', '').replace(
             '\xa0', '').replace('\t', '').strip()
         temp_text = ' '.join(temp_text.split())
         texts.append(temp_text[:-8])
     self.summarizations[int(index)] = texts
Example 3
 def load_av_lists(self):
     url = self.MEMBER_SUBMIT_URL % self.assign_up_mid
     json_req = basic_req(url, 1)
     if json_req is None or 'data' not in json_req or 'vlist' not in json_req['data']:
         if can_retry(url):
             self.load_av_lists()
         return
     av_id_map = {ii['aid']: ii for ii in json_req['data']['vlist']}
     if self.basic_av_id not in av_id_map:
         if can_retry(url):
             self.load_av_lists()
         return
     self.av_id_map = av_id_map
Example 4
 def _getroom_id(self, next_to=True, proxy=True):
     ''' get av room id '''
     url = self.ROOM_INIT_URL % self._av_id
     html = proxy_req(url, 0) if proxy else basic_req(url, 0)
     head = html.find_all('head') if html is not None else []
     scripts = head[0].find_all('script') if head else []
     if len(scripts) < 4 or '{' not in scripts[3].text:
         if can_retry(url):
             self._getroom_id(proxy=proxy)
         else:
             self._getroom_id(proxy=False)
         next_to = False
     if next_to:
         script_list = scripts[3].text
         script_begin = script_list.index('{')
         script_end = script_list.index(';')
         script_data = script_list[script_begin:script_end]
         json_data = json.loads(script_data)
         if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
             self._room_id = json_data['videoData']['cid']
         else:
             self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
         print('Room_id:', self._room_id)
Example 5
    def get_goods_id_first(self, origin_url, index):
        """
        get goods id first
        """

        origin_url = origin_url.replace('https', 'http')
        # first_result = proxy_req(origin_url, 0)
        first_result = basic_req(origin_url, 0, header=self.headers)

        if not first_result or len(first_result.find_all('script')) < 2:
            if can_retry(origin_url):
                self.get_goods_id_first(origin_url, index)
            return

        wait = first_result.find_all('script')[1].text
        if '"title":"' not in wait:
            return
        title = re.findall('"title":".*","',
                           wait)[0].split('","')[0].split('":"')[1]
        if title in self.title2map:
            self.goods_map[index] = self.title2map[title]
            self.url2goods[origin_url] = self.title2map[title]

            print(self.title2map[title])
        else:
            print(title)
Example 6
 def search_goods_once(self, goods_name, index):
     if not os.path.exists('%scookie_alimama' % data_dir):
         print('alimama cookie not exist!!!')
         return
     with codecs.open('%scookie_alimama' % data_dir, 'r',
                      encoding='utf-8') as f:
         cookie = f.readlines()
     url_list = [
         'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
         cookie[1][:-1], '&pvid=', cookie[2][:-1], '&t=',
         str(int(round(time.time() * 1000))), '&_t=',
         str(int(round(time.time() * 1000))), '&q=', goods_name
     ]
     headers = {
         'X-Requested-With': 'XMLHttpRequest',
         'Cookie': '',
         'Content-Type': get_content_type(),
         'Accept': get_accept('xhr'),
     }
     headers['Cookie'] = cookie[0][:-1]
     ca = basic_req(''.join(url_list), 2, header=headers)
     if ca.status_code != 200 or 'data' not in ca.json():
         if can_retry(''.join(url_list)):
             self.search_goods_once(goods_name, index)
         return
     page_list = ca.json()['data']['pageList']
     item = page_list[0]
     title = '||'.join([str(item['auctionId']), goods_name, str(item['zkPrice'])])
     self.goods_name[index] = title
     print(title)
Example 7
def load_index():
    ''' load index '''
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    if not text:
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)

    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_another]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_again]
    shuffle_batch_run_thread(threading_list, 100)
    # deduplicate the movie list
    movie_list = set(movie_list)
    # export the crawled movie list
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))
    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            url_num, movie_num, out_path, end_time(version, 0)))
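shuffle_batch_run_thread in Example 7 is another helper that is not defined on this page. Judging from the name and the call sites, it presumably shuffles the thread list and then runs it in bounded batches; the following is a hedged sketch of such a helper, not the original code.

import random

def shuffle_batch_run_thread(threads, batch_size):
    ''' guessed helper: shuffle, then start/join threads in batches '''
    random.shuffle(threads)
    for ii in range(0, len(threads), batch_size):
        batch = threads[ii:ii + batch_size]
        for tt in batch:
            tt.start()
        for tt in batch:
            tt.join()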
Example 8
    def load_spot_once(self, pn=1, city_id=10186):
        ''' load spot once '''
        data = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': city_id,
            'iTagId': 0,
            'iPage': pn,
        }
        data = self.load_sn(data)
        print(data)
        req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data)
        if req is None or 'data' not in req or 'list' not in req['data']:
            if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
                self.load_spot_once(pn, city_id)
            return
        spot_list = req['data']['list']
        spot_pn = req['data']['page']
        spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
        try:
            total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
        except Exception as e:
            total_pn = 1
            echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)

        if city_id not in self.spot_result:
            self.spot_result[city_id] = spot_tmp
        else:
            self.spot_result[city_id] += spot_tmp
        self.spot_pn[city_id] = total_pn
Example 9
 def get_movie_list_from_tabs(self, sorts: str, tags: str, genres: str, year_range: str, star: int = 0):
     ''' get info from movie list '''
     params_dict = {
         'sort': sorts,
         'range': '0,10',
         'tags': urllib.parse.quote(tags),
         'genres': urllib.parse.quote(genres),
         'star': star,
         'limit': 1000 if star < 9000 else 9999 - star,
         'year_range': year_range,
     }
     params = ['{}={}'.format(ii, jj)
               for ii, jj in params_dict.items() if jj != '']
     url = '{}{}'.format(self.NEW_SEARCH_SUBJECT_URL, '&'.join(params))
     self.generate_cookie()
     movie_req = proxy_req(url, 2)
     if movie_req is None:
         if can_retry(url):
             self.get_movie_list_from_tabs(
                 sorts, tags, genres, year_range, star)
         else:
             self.again_list.append([sorts, tags, genres, year_range, star])
             echo(0, url, 'Failed')
         return
     if movie_req.status_code != 200:
         return
     try:
         movie_json = movie_req.json()
         echo(2, url, 'loaded')
         id2name = {int(ii['id']): ii['title'] for ii in movie_json['data']}
         self.movie_id2name = {**self.movie_id2name, **id2name}
     except Exception:
         echo(0, url, 'Except!')
Example 10
 def get_search_list(self, q: str):
     if self.proxy_can_use:
         base_url = self.API_PROXY_URL if random.random() * 10 > 7 else self.API_BASIC_URL
     else:
         base_url = self.API_BASIC_URL
     url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
     search_json = proxy_req(url, 1)
     if search_json is None or 'subjects' not in search_json:
         if search_json and search_json.get('code') == 112:
             self.proxy_can_use = False
         if can_retry(url, 6):
             time.sleep(random.random() *
                        (3.14 + random.randint(4, 10)) + 3.14)
             self.get_search_list(q)
         else:
             self.again_list.append(q)
             echo(0, url, 'Failed')
         return
     # echo(2, url, 'loaded')
     id2name = {int(ii['id']): ii['title']
                for ii in search_json['subjects']}
     self.movie_id2name = {**self.movie_id2name, **id2name}
     self.finish_list.append(q)
     if not len(self.finish_list) % 600:
         echo(2, len(self.finish_list), 'Finish...')
         dump_bigger(self.movie_id2name,
                     '{}douban_movie_id.pkl'.format(data_dir))
Example 11
    def load_url(self):
        ''' load url from zimuzu '''

        url = 'http://zmz005.com/{}'.format(self.zimuzu_id)
        detail = proxy_req(url, 0)
        total = []

        if not detail:
            print('retry')
            if can_retry(url):
                self.load_url()
            return
        season_list = detail.find_all(
            'div', class_='tab-content info-content')[1:]
        for season in season_list:
            quality_list = season.find_all('div', class_='tab-pane')
            url_body = quality_list[1] if 'APP' in quality_list[0]['id'] else quality_list[0]
            season_id = re.findall(r"\d+\.?\d*", url_body['id'])[0]
            total.append(season_id)
            if int(season_id) < 12:
                url_body = quality_list[1]

            url_list = url_body.find_all('ul', class_='down-links')
            links = [ii.find_all('div', class_='copy-link')[1]['data-url']
                     for ii in url_list]
            total.append('\n'.join(links) + '\n')
        with codecs.open('{}{}'.format(data_dir, self.drama_name), 'w', encoding='utf-8') as f:
            f.write('\n'.join(total))
Example 12
    def summarization_once(self, index):
        """
        get html from news
        """
        print(index)
        texts = []
        hrefs = []
        if index:
            url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + \
                str(index * 10)
        else:
            url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
        news_lists = basic_req(url, 0)
        if news_lists is None:
            if can_retry(url):
                self.summarization_once(index)
            return
        href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
        summarization_lists = news_lists.find_all('div', class_='gG0TJc')

        if not href_lists or not summarization_lists:
            if can_retry(url):
                self.summarization_once(index)
            return
        print('num: ', len(summarization_lists), url)
        for href in href_lists:
            hrefs.append(href['href'])
        for summarization in summarization_lists:
            temp_text = summarization.text.replace('\n', '').replace(
                '\xa0', '').replace('\t', '').replace('...', '').strip()
            temp_text = ' '.join(temp_text.split())
            texts.append(temp_text)
        self.summarizations[int(index)] = texts
        self.hrefs[int(index)] = hrefs
Example 13
 def get_web_content(self):
     req = proxy_req(self.WEB_URL, 3, header=self.get_ynote_web_header())
     if req is None or len(req) < 1000:
         if can_retry(self.WEB_URL):
             return self.get_web_content()
         else:
             return
     return req
Example 14
 def get_a_m_basic(self, a_m_url: str):
     headers = self.get_tb_headers(a_m_url)
     req = proxy_req(a_m_url, 2, header=headers, config={"allow_redirects": False})
     if req is None or "location" not in req.headers:
         if can_retry(a_m_url):
             return self.get_a_m_basic(a_m_url)
         return
     return req
Example 15
 def get_api_req(self, url: str, av_id: int):
     req = self.proxy_req(url, 1, header=self.get_api_headers(av_id))
     if req is None or list(req.keys()) != self.JSON_KEYS:
         if can_retry(url):
             return self.get_api_req(url, av_id)
         else:
             return
     return req["data"]
Example 16
 def get_s_click_detail(self, redirect_url: str, tu_url: str):
     headers = self.get_tb_headers(refer_url=tu_url)
     req = proxy_req(redirect_url, 2, header=headers)
     if req is None or "id=" not in req.url:
         if can_retry(redirect_url):
             return self.get_s_click_detail(redirect_url, tu_url)
         else:
             return
     return self.get_item_detail(req.url)
Example 17
    def get_request(self, url, types):

        result = basic_req(url, 1)

        if not result:
            if can_retry(url):
                return self.get_request(url, types)
            return
        return result
Example 18
    def get_request_v2(self, url, types, header):

        result = proxy_req(url, 0, header=header)

        if not result or not result.find_all('div', class_='content'):
            if can_retry(url):
                return self.get_request_v2(url, types, header)
            return
        return result
Example 19
    def get_request_v3(self, url, types):

        result = basic_req(url, 0)

        if not result or not result.find_all('p', class_='content'):
            if can_retry(url):
                return self.get_request_v3(url, types)
            return
        return result
Example 20
 def share_article(self, article_id: str):
     p = self.share2article[article_id][-2].split("/")[-1]
     url = self.MYSHARE_URL % (p, self.cstk)
     req = proxy_req(url, 1, header=self.get_ynote_web_header(1))
     if req is None or list(req.keys()) != ["entry", "meta"]:
         if can_retry(url):
             return self.share_article(article_id)
         return False
     echo("2", "Share article {} Success!!!".format(article_id))
     return True
Example 21
 def get_captcha(self, cookie: dict = {}):
     url = self.CAPTCHA_URL
     headers = self.get_login_headers(0, cookie)
     captcha, cookies = proxy_req(url, 1, header=headers, need_cookie=True)
     if captcha is None or list(captcha.keys()) != ['code', 'data']:
         if can_retry(url):
             return self.get_captcha(cookie)
         else:
             return None, {}
     return captcha['data']['result'], cookies
Example 22
 def get_item_basic(self, item_id: int, url: str = ""):
     url = self.ITEM_URL % item_id if url == "" else url
     headers = {"Accept": get_accept("html")}
     req = proxy_req(url, 2, header=headers, config={"allow_redirects": False})
     if req is None:
         if can_retry(url):
             return self.get_item_basic(item_id, url)
         return
     if req.status_code != 200:
         return self.get_item_basic(item_id, req.headers["Location"])
     return req
Example 23
    def get_request(self, url: str, types: int, functs, header: dict = {}):
        if header:
            req = basic_req(url, types, header=header)
        else:
            req = basic_req(url, types)

        if functs(req):
            if can_retry(url):
                return self.get_request(url, types, functs, header)
            return
        return req
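Example 23 generalizes the three get_request variants above by letting the caller supply the failure predicate instead of hard-coding one. A hypothetical call inside the same class, reproducing the check from get_request_v3, could look like this (functs must return truthy when the response is unusable and should be retried):

        # Hypothetical usage of Example 23's validator-driven get_request;
        # the lambda returns True when the page should be retried.
        result = self.get_request(
            url, 0, lambda req: not req or not req.find_all('p', class_='content'))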
Example 24
 def href_once(self, index):
     """
     get html from news
     """
     print(index)
     url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=毒狗肉&pn=' + \
         str(index * 10)
     news_lists = proxy_req(url, 0)
     if not news_lists:
         if can_retry(url):
             self.href_once(index)
         return
     test = news_lists.find_all('div', class_='result')
     if not test:
         if can_retry(url):
             self.href_once(index)
         return
     href_list = [ii.a['href'] for ii in test]
     self.href_map[int(index)] = href_list
Example 25
 def get_cid(self, av_id: int):
     playlist_url = self.PLAYLIST_URL % av_id
     headers = {'Accept': '*/*', 'Referer': self.ROOM_INIT_URL % av_id}
     req = proxy_req(playlist_url, 1, header=headers)
     if req is None or list(req.keys()) != self.JSON_KEYS:
         if can_retry(playlist_url):
             return self.get_cid(av_id)
         else:
             return
     cid = [ii['cid'] for ii in req['data']]
     return cid
Example 26
    def check_type_req(self, av_id: int):
        changeHeaders({'Referer': self.BASIC_AV_URL % av_id})
        url = self.VIEW_URL % av_id

        json_req = proxy_req(url, 1)

        if json_req is None or 'data' not in json_req or 'tid' not in json_req['data']:
            if can_retry(url):
                self.check_type_req(av_id)
            return
        self.rank_type[av_id] = json_req['data']['tid'] == self.assign_tid
Example 27
    def load_comment_v1(self, movie_id: int, start: int):
        ''' load comment '''
        url = self.COMMENT_URL % (movie_id, start)
        self.generate_cookie()
        comment_json = proxy_req(url, 1)
        if comment_json is None or 'html' not in comment_json:
            if can_retry(url):
                time.sleep(random.random() * random.randint(0, 4))
                self.load_comment_v1(movie_id, start)
            else:
                self.again_list.append([movie_id, start])
                echo(0, url, 'Failed')
            return
        comment_html = comment_json['html']
        # comment_bs4 = BeautifulSoup(comment_html, 'html.parser')
        # comment = {}
        # for ii in comment_bs4.find_all('div', class_='comment-item'):
        #     user_id = ii.a['href'].split('/')[-2]
        #     user_name = ii.a['title']
        #     votes = ii.find_all('span', class_='votes')
        #     votes = votes[0].text if len(votes) else ''
        #     comment_time = ii.find_all(
        #         'span', class_='comment-time')[0]['title']
        #     rate = ii.find_all('span', class_='rating')
        #     rate = rate[0]['class'][0].split('allstar')[1] if len(rate) else ''
        #     short = ii.find_all('span', class_='short')
        #     short = short[0] if len(short) else ''
        #     comment[user_id] = [user_name, user_id,
        #                         comment_time, short, votes, rate]
        # user_list = set(comment)

        user_list = re.findall(
            r'title="(.*?)" href="https://www.douban.com/people/([\s\S]{1,30}?)/"\>', comment_html)

        if not user_list:
            self.finish_list[(movie_id, start)] = 0
            self.checkpoint()
            return
        votes = re.findall(r'votes"\>(\w{1,7}?)<', comment_html)
        comment_time = re.findall(r'-time " title="(.*?)">\n', comment_html)
        short = re.findall(r'class="short">([\s\S]*?)</span>', comment_html)
        rate = re.findall(r'allstar(\w{1,2}?) rat', comment_html)
        if len(user_list) != len(comment_time) or len(user_list) != len(short):
            echo(0, url, 'Comment reg error!!!')
        comment = {
            jj[1]: [jj[0], jj[1], comment_time[ii],
                    short[ii] if ii < len(short) else '',
                    votes[ii] if ii < len(votes) else '',
                    rate[ii] if ii < len(rate) else '']
            for ii, jj in enumerate(user_list)
        }
        user_list = {ii[1] for ii in user_list}
        self.user_info = {*self.user_info, *user_list}
        self.comment[movie_id] = {**self.comment[movie_id], **comment}
        if len(user_list) == 20 and (not (start + 20) % 100 or start < 100):
            self.more_user.append([movie_id, start + 20])
        self.finish_list[(movie_id, start)] = 0
        self.checkpoint()
Example 28
 def get_api_req(self, url: str, bv_id: str, types: int = 0):
     if types == 0:
         req = self.proxy_req(url, 1, header=self.get_api_headers(bv_id))
     else:
         req = self.proxy_req(url, 3, header=self.get_api_headers(bv_id))
         req = self.decoder_jp(req)
     if req is None or list(req.keys()) != self.JSON_KEYS:
         if can_retry(url):
             return self.get_api_req(url, bv_id, types)
         else:
             return
     return req["data"]
Example 29
 def request_text(self, url):
     ''' requests text '''
     req = basic_req(url, 2)
     if req is None:
         echo(0, url)
         if can_retry(url):
             return self.request_text(url)
         return ''
     echo(1, url)
     return req.text
Example 30
 def get_danmaku_once(self, oid: int):
     dm_url = self.DM_URL % oid
     req = proxy_req(dm_url, 2)
     if req is None:
         if can_retry(dm_url):
             return self.get_danmaku_once(oid)
         else:
             return
     req.encoding = "utf-8"
     dm = regex.findall('p="(.*?)">(.*?)</d>', req.text)
     echo(3, "oid {} have {} dm".format(oid, len(dm)))
     return dm, oid
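Example 30 pairs naturally with Example 25: get_cid lists the cids of a video, and get_danmaku_once fetches the danmaku for one cid. A hypothetical driver that fans the downloads out over threads, reusing shuffle_batch_run_thread from Example 7, might look like the following (the danmaku_map attribute and the batch size of 100 are assumptions, not part of the original code):

 import threading

 def load_danmaku(self, av_id: int):
     ''' hypothetical driver combining Examples 25 and 30 '''
     cids = self.get_cid(av_id)
     if not cids:
         return
     self.danmaku_map = {}

     def worker(oid: int):
         result = self.get_danmaku_once(oid)
         if result is not None:
             dm, _ = result
             self.danmaku_map[oid] = dm

     threading_list = [threading.Thread(target=worker, args=(ii,))
                       for ii in cids]
     shuffle_batch_run_thread(threading_list, 100)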