def judgeurl(self, urls, index, times):
    """ judge whether a proxy is usable:
        use /api/playlist to test http proxies; use /discover/playlist for https
        1. the request must finish within timeout = 5
        2. response.result.tracks must have the expected size
    """
    http_type = urls[4] == 's'
    proxies = {type_map[http_type]: urls}
    test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
    ss_url = 'https://www.google.com/?gws_rd=ssl'
    try:
        data = basic_req(test_url, 1, proxies)
        result = data['result']
        tracks = result['tracks']
        if len(tracks) == 56:
            if times < 2:
                self.judgeurl(urls, index, times + 1)
            else:
                self.canuseip[index] = [urls, int(http_type)]
                data = basic_req(ss_url, 0)
                if len(str(data)) > 5000:
                    self.canuseip[index] = [urls, int(http_type) + 2]
        else:
            self.cannotuseip[index] = urls
    except:
        if not index in self.canuseip:
            self.cannotuseip[index] = urls

def judgeurl(self, urls, index, times, ss_test=False):
    """ judge whether a proxy is usable:
        use /api/playlist to test http proxies; use /discover/playlist for https
        1. the request must finish within timeout = 5
        2. response.result.tracks must have the expected size
    """
    http_type = urls[4] == 's'
    proxies = {type_map[http_type]: urls}
    test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
    ss_url = 'https://www.google.com/?gws_rd=ssl'
    try:
        data = basic_req(test_url, 1, proxies)
        result = data['result']
        tracks = result['tracks']
        if len(tracks) == 56:
            if times < 0:
                self.judgeurl(urls, index, times + 1)
            else:
                echo(1, urls, proxies, 'Proxies can use.')
                self.canuse_proxies.append(urls)
                self.canuseip[index] = [urls, int(http_type)]
                if ss_test:
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.canuseip[index] = [urls, int(http_type) + 2]
        else:
            echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
            self.cannotuseip[index] = urls
    except:
        echo(0, urls, proxies, 'return error [][][][][][]')
        if not index in self.canuseip:
            self.cannotuseip[index] = urls

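# A minimal standalone sketch of the same proxy check using `requests`
# directly (the repo's basic_req helper wraps something like this). The
# playlist id, the expected track count (56), and the 5s timeout mirror
# judgeurl above; everything else here is illustrative.
import requests

def probe_proxy(proxy_url, timeout=5):
    scheme = 'https' if proxy_url[4] == 's' else 'http'
    test_url = scheme + '://music.163.com/api/playlist/detail?id=432853362'
    try:
        resp = requests.get(test_url, proxies={scheme: proxy_url}, timeout=timeout)
        tracks = resp.json()['result']['tracks']
        return len(tracks) == 56
    except (requests.RequestException, KeyError, ValueError):
        return False
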
def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)
    ''' decode js '''
    decode_js = codecs.unicode_escape_decode(origin_js)[0]
    ''' params replace '''
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')
    echo(2, 'salt', self.slat)
    ''' write the decoded js to a local file '''
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))
    ''' delete the function about ajax '''
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')

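# Toy illustration of the "params replace" step above: the obfuscated
# js references a lookup array (`_0x1a2b[0]` style), and substituting
# each reference with its literal recovers readable code. The sample
# string here is fabricated for demonstration.
sample_js = 'var x;var y;var _0x1a2b=["md5","salt"];hash=_0x1a2b[0];key=_0x1a2b[1];'
replace_list_str = sample_js.split(';')[2]                          # 'var _0x1a2b=[...]'
begin = replace_list_str.index('=[') + 2
end = replace_list_str.index(']')
tokens = replace_list_str[begin:end].split(',')                     # ['"md5"', '"salt"']
rp = replace_list_str[replace_list_str.index(' ') + 1:begin - 2]    # '_0x1a2b'
decoded = sample_js
for ii, jj in enumerate(tokens):
    decoded = decoded.replace('{}[{}]'.format(rp, ii), jj)
assert decoded.endswith('hash="md5";key="salt";')
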
def test_change_youdaoyun(self, article_id, body, article_name):
    """ change a youdaoyun article (demo)
        @param article_id: index of the article to change
        @param body: new article body
        @param article_name: new article name
        requires the youdaoyun web cookie in 'buildmd/data/cookie'
    """
    url = 'https://note.youdao.com/yws/api/personal/sync?method=push&keyfrom=web&cstk=E3CF_lx8'
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
        'Origin': 'https://note.youdao.com',
        'Referer': 'https://note.youdao.com/web'
    }
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    headers['Cookie'] = cookie[:-1]
    headers['Host'] = url.split('/')[2]
    file_list_url = 'https://note.youdao.com/yws/api/personal/file?method=listRecent&offset=0&limit=30&keyfrom=web&cstk=E3CF_lx8'
    file_data = {'cstk': 'E3CF_lx8'}
    ca = basic_req(file_list_url, 11, data=file_data, header=headers)
    if not len(ca):
        print('List Error')
        return
    change_data_origin = ca[article_id]['fileEntry']
    body_string = [
        '<?xml version="1.0"?><note xmlns="http://note.youdao.com" schema-version="1.0.3" file-version="0"><head/><body><para><coId>12-1550424181958</coId><text>',
        body,
        '</text><inline-styles/><styles/></para></body></note>'
    ]
    change_data = {
        'name': article_name,
        'fileId': change_data_origin['id'],
        'parentId': change_data_origin['parentId'],
        'domain': change_data_origin['domain'],
        'rootVersion': -1,
        'sessionId': '',
        'modifyTime': int(round(time.time())),
        'bodyString': ''.join(body_string),
        'transactionId': change_data_origin['id'],
        'transactionTime': int(round(time.time())),
        'orgEditorType': change_data_origin['orgEditorType'],
        'tags': change_data_origin['tags'],
        'cstk': 'E3CF_lx8'
    }
    print(change_data)
    cb = basic_req(url, 12, data=change_data, header=headers)
    return cb

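# The note body youdao expects is a small XML envelope; a standalone
# helper for it, mirroring body_string above (the coId value is copied
# from the request above and may well be arbitrary):
def build_body_string(body):
    return ('<?xml version="1.0"?><note xmlns="http://note.youdao.com" '
            'schema-version="1.0.3" file-version="0"><head/><body><para>'
            '<coId>12-1550424181958</coId><text>' + body +
            '</text><inline-styles/><styles/></para></body></note>')
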
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace('\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs

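# The two-step cleanup above first strips newlines/tabs/nbsp/ellipses,
# then collapses runs of whitespace; in isolation (sample text invented):
raw = 'Dog\n walking\xa0 scams\t ...'
clean = ' '.join(raw.replace('\n', '').replace('\xa0', '').replace('\t', '').replace('...', '').strip().split())
assert clean == 'Dog walking scams'
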
def bulk_import_alimama_once(self, index, group_id):
    """ bulk import one batch (200 goods) into an alimama favorites group """
    url = 'http://pub.alimama.com/favorites/item/batchAdd.json'
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    goods_len = len(self.goods_candidate)
    begin_id = index * 200
    end_id = min(goods_len, (index + 1) * 200)
    goods_ids = self.goods_candidate[begin_id:end_id]
    update_data = {
        'groupId': group_id,
        'itemListStr': ','.join(goods_ids),
        't': str(int(round(time.time() * 1000))),
        '_tb_token_': cookie[1][:-1],
        'pvid': cookie[2][:-1]
    }
    print(update_data)
    cb = basic_req(url, 12, data=update_data, header=self.headers)
    if cb.status_code == 200 and cb.json()['info']['message'] != 'nologin':
        print(cb.json()['data'])

def get_cookie(self):
    """ make a login cookie
        PS: although the cookie's expiry time is more than one year,
        it breaks as soon as the connection closes,
        so you need to reactivate the cookie with this function.
    """
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Host': 'www.gatherproxy.com',
        'Origin': 'http://www.gatherproxy.com',
        'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
        'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    login_url = 'http://www.gatherproxy.com/subscribe/login'
    cookie_html = basic_req(login_url, 0, header=headers)
    verify_text = cookie_html.find_all('div', class_='label')[2].span.text
    verify_list = verify_text.replace('= ', '').strip().split()
    num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
               'Fine': 5, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8,
               'Nine': 9, 'Ten': 10}  # 'Fine' guards what is presumably the site's misspelling of 'Five'
    verify_num = [verify_list[0], verify_list[2]]
    for index, num in enumerate(verify_num):
        if num.isdigit():
            verify_num[index] = int(num)
        elif num in num_map:
            verify_num[index] = num_map[num]
        else:
            echo(0, 'Error', index)
    verify_code = 0
    error = True
    operation = verify_list[1]
    if operation in ('+', 'plus', 'add'):
        verify_code = verify_num[0] + verify_num[1]
        error = False
    if operation in ('-', 'minus'):
        verify_code = verify_num[0] - verify_num[1]
        error = False
    if operation in ('X', 'multiplication', 'multiplied'):  # 'multiplied' treated as multiplication
        verify_code = verify_num[0] * verify_num[1]
        error = False
    if error:
        echo(0, 'Error', operation)
    if not os.path.exists('%spassage' % data_dir):
        echo(0, 'gather passage not exist!!!')
        return
    with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
        passage = [index[:-1] for index in f.readlines()]
    data = {'Username': passage[0], 'Password': passage[1], 'Captcha': str(verify_code)}
    time.sleep(2.163)
    r = requests.session()
    r.cookies = cj.LWPCookieJar()
    login_req = r.post(login_url, headers=headers, data=data, verify=False)

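# Standalone sketch of the word-arithmetic captcha solved above; the
# operand words and operator branches follow get_cookie, while the
# sample input is invented.
NUM_MAP = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4,
           'Fine': 5, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8,
           'Nine': 9, 'Ten': 10}

def solve_captcha(text):
    left, op, right = text.replace('= ', '').strip().split()[:3]
    to_int = lambda w: int(w) if w.isdigit() else NUM_MAP[w]
    a, b = to_int(left), to_int(right)
    if op in ('+', 'plus', 'add'):
        return a + b
    if op in ('-', 'minus'):
        return a - b
    return a * b  # 'X' / 'multiplication' / 'multiplied'

assert solve_captcha('Three + 4 = ') == 7
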
def get_goods_id_first(self, origin_url, index):
    """ get goods id, first pass """
    origin_url = origin_url.replace('https', 'http')
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if not '"title":"' in wait:
        return
    title = re.findall('"title":".*","', wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)

def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = get_request_proxy(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    if not len(head) or len(head[0].find_all('script')) < 4 or not '{' in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)

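# The script-tag scrape above slices from the first '{' to the first
# ';' and feeds the result to json.loads; the same trick in isolation
# (the sample script text is fabricated):
import json
script_text = 'window.__INITIAL_STATE__={"videoData":{"cid":7,"pages":[{"cid":7}]}};(function(){}());'
begin, end = script_text.index('{'), script_text.index(';')
state = json.loads(script_text[begin:end])
assert state['videoData']['cid'] == 7
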
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            return self.get_request(url, types)
        return
    return result

def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if result is None or not result or not len(result.find_all('p', class_='content')):
        if can_retry(url):
            return self.get_request_v3(url, types)
        return
    return result

def basic_view(self, url: str, times: int, types: int):
    ''' press view count: no data input needed '''
    url = self.AV_URL
    if types == 1:
        html = get_request_proxy(url, 0)
    else:
        html = basic_req(url, 0)
    if html == False and times < 5:
        self.basic_view(url, times + 1, types)

def get_request_proxy(self, url: str, types: int, data=None, test_func=None, header=None):
    """ send requests through a proxy, and record proxies that can't be used
        @types S0XY:
            X: 0 -> get, 1 -> post
            Y: 0 -> html, 1 -> json, 2 -> basic
            S: 0 -> basic, 1 -> ss
        supports retry on failure && auto-records failing proxies
    """
    httptype = url[4] == 's'
    ss_type = types // 1000
    types %= 1000
    if ss_type:
        proxylist = self.proxylists_ss if httptype else self.proxylist_ss
    else:
        proxylist = self.proxylists if httptype else self.proxylist
    if not len(proxylist):
        if self.Db.db:
            echo(0, 'Proxy pool empty!!! Please check the db conn & db dataset!!!')
        proxies = {}
    else:
        index = random.randint(0, len(proxylist) - 1)
        proxies_url = proxylist[index]
        proxies = {type_map[httptype]: proxies_url}
    try:
        result = basic_req(url, types, proxies, data, header)
        if test_func is not None and not test_func(result):
            if self.check_retry(url):
                return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
            self.failuredtime[url] = 0
            return
        return result
    except:
        self.cannotuseip[random.randint(0, MAXN)] = proxies_url
        if proxies_url in proxylist:
            proxylist.remove(proxies_url)
        if not len(self.cannotuseip.keys()) % 10:
            self.cleancannotuse()
        if self.check_retry(url):
            return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
        return

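# The `types` flag packs three fields; a small decoder clarifying the
# S0XY encoding documented above (illustrative only):
def decode_types(types):
    ss = types // 1000                                   # S: 0 -> basic, 1 -> ss
    types %= 1000
    method = 'post' if types // 10 else 'get'            # X
    fmt = {0: 'html', 1: 'json', 2: 'basic'}[types % 10] # Y
    return ss, method, fmt

assert decode_types(1011) == (1, 'post', 'json')
assert decode_types(0) == (0, 'get', 'html')
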
def request_text(self, url):
    ''' request text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text

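# The basic_req + can_retry idiom above recurs throughout this repo; a
# generic standalone version of the same pattern (here with a plain
# bounded loop instead of the repo's per-url failure counter):
def fetch_with_retry(fetch, url, validate, retries=3, default=None):
    for _ in range(retries):
        result = fetch(url)
        if validate(result):
            return result
    return default
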
def get_goods_second(self, url, index):
    second_result = basic_req(url, 0, header=self.headers)
    if not second_result or not len(second_result.find_all('input')):
        if can_retry(url):
            self.get_goods_second(url, index)
        return
    goods_id = second_result.find_all('input')[6]['value']
    print(goods_id)
    self.goods_map[index] = goods_id

def basic_press(self, url, times, types):
    """ press: no data input needed """
    url = url + str(int(round(time.time() * 1000)))
    if types == 1:
        html = get_request_proxy(url, 1)
    else:
        html = basic_req(url, 1)
    if html == False and times < 5:
        self.basic_press(url, times + 1, types)

def load_city_list(self):
    ''' load city list '''
    text = basic_req(self.MDD_URL, 3)
    city_list = re.findall('/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)', text)
    id2map = {int(ii[0]): ii[1].strip() for ii in city_list if ii[0].isdigit()}
    self.city_list = id2map.keys()
    self.id2map = id2map

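# The city-list regex above, run against a fabricated html fragment to
# show what it captures (id, name, closing tag):
import re
sample = '<a href="/travel-scenic-spot/mafengwo/10065.html" target="_blank">北京</a>'
pairs = re.findall('/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)', sample)
assert pairs == [('10065', '北京', '</a>')]
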
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1], '&pvid=', cookie[2][:-1], '&t=',
        str(int(round(time.time() * 1000))), '&_t=',
        str(int(round(time.time() * 1000))), '&q=', goods_name
    ]
    headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': cookie[0][:-1],
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or not 'data' in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = ['||'.join([str(goods['auctionId']), goods_name, str(goods['zkPrice'])])
             for goods in page_list][0]
    self.goods_name[index] = title
    print(title)

def get_hotel_detail(self):
    ''' get hotel detail '''
    params = {
        **self.generate_other_params(),
        'callback': self.generate_callback(16),
        'eleven': self.generate_eleven(),
        '_': int(time.time() * 1000)
    }
    params_list = ['{}={}'.format(ii, jj if jj is not None else '') for ii, jj in params.items()]
    url = '{}?{}'.format(HOTEL_ROOMLIST_FOR_DETAIL_URL, '&'.join(params_list))
    echo(2, 'XHR url', url)
    text = basic_req(url, 1)
    return text

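# A note on the hand-rolled '&'.join above: urlencode would
# percent-escape the eleven token, which may not be what the endpoint
# expects. For comparison, the standard-library form (BASE_URL is a
# hypothetical stand-in for HOTEL_ROOMLIST_FOR_DETAIL_URL):
from urllib.parse import urlencode
BASE_URL = 'https://example.com/roomlist'
params = {'callback': 'cb0123456789abcd', 'eleven': 'token-value', '_': 1555555555000}
url = '{}?{}'.format(BASE_URL, urlencode(params))
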
def xiciproxy(self, page):
    """ xici proxy: http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but most of its proxies no longer work.
    """
    if not str(page).isdigit():
        print('Please input num!')
        return []
    version = begin_time()
    url = 'http://www.xicidaili.com/nn/%d'
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all('tr')
        for jj in range(1, len(tem)):
            tds = tem[jj].find_all('td')
            protocol = tds[5].text.lower()
            self.waitjudge.append(protocol + '://' + tds[1].text + ':' + tds[2].text)
    self.threadjude()
    end_time(version)

def get_playlist_id(self, classify, offset):
    """ get playlist ids for one classify """
    host = 'https://music.163.com'
    allclassify = classify == '全部风格'
    url = host + self.classifylist[classify] + ('?' if allclassify else '&') + 'order=hot&limit=35&offset=' + str(offset)
    html = basic_req(url, 0)
    if not html:
        if can_retry(url):
            self.get_playlist_id(classify, offset)
        return []
    alist = html.find_all('a', class_='icon-play')
    if not len(alist):
        if can_retry(url):
            self.get_playlist_id(classify, offset)
        return []
    for index in alist:
        self.playlists.append(index['data-res-id'])

def load_collect_once(self, index):
    """ load taobao collect """
    baseurl = 'https://shoucang.taobao.com/item_collect_n.htm?t='
    url = baseurl + str(int(round(time.time() * 1000)))
    if index:
        url += '&ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=' + str(30 * index)
    collect_html = basic_req(url, 0)
    collect_list = []
    if not isinstance(collect_html, bool):
        collect_list = collect_html.find_all('li', class_=[
            'J_FavListItem g-i-item fav-item ',
            'J_FavListItem g-i-item fav-item isinvalid',
            'J_FavListItem g-i-item fav-item istmall ',
            'J_FavListItem g-i-item fav-item istmall isinvalid'
        ])
        print(len(collect_list))
    if isinstance(collect_html, bool) or not len(collect_list):
        if can_retry(baseurl + str(index), index):
            self.load_collect_once(index)
        return
    text = []
    for collect in collect_list:
        data_id = collect['data-id']
        title = collect.find_all('a', class_='img-item-title-link')[0].text
        price = collect.find_all('div', class_='g_price')[0].strong.text if len(collect.find_all('div', class_='g_price')) else '0'
        text.append('||'.join([data_id, title, price]))
    self.collect[index] = text

def load_rank_index(self, index: int, day_index: int):
    ''' load rank '''
    changeHeaders({'Referer': self.RANKING_URL % (index, day_index)})
    url = self.RANKING_URL % (index, day_index)
    html = basic_req(url, 0)
    rank_list = html.find_all('li', class_='rank-item')
    now_av_id = []
    wait_check_public = []
    rank_map = {}
    for av in rank_list:
        av_href = av.find_all('a')[0]['href']
        av_id = int(re.findall('av.*', av_href)[0][2:-1])
        now_av_id.append(av_id)
        if not self.check_type(av_id):
            continue
        rank = int(av.find_all('div', class_='num')[0].text)
        score = int(av.find_all('div', class_='pts')[0].find_all('div')[0].text)
        name = av.find_all('span')[2].text
        if self.add_av(av_id, rank, score):
            rank_map[av_id] = [rank, score, name, index, day_index]
    ''' check assign av rank '''
    for ii in self.assign_ids:
        if not ii in self.public:
            wait_check_public.append(ii)
        if not ii in self.last_view and not ii in self.rank_map:
            self.rank_map[ii] = []
    have_assign = len([0 for ii in self.assign_ids if ii in now_av_id]) > 0
    ''' check tid type '''
    threading_public = []
    for ii in rank_map.keys():
        work = threading.Thread(target=self.check_type_req, args=(ii,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    for ii, jj in rank_map.items():
        if self.check_type(ii) != True:
            continue
        if not ii in self.public:
            wait_check_public.append(ii)
        self.last_check[ii] = int(time.time())
        self.rank_map[ii] = jj
    ''' load public basic data '''
    threading_public = []
    for ii in wait_check_public:
        work = threading.Thread(target=self.public_data, args=(ii, 0))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    ''' begin monitor '''
    threading_list = []
    for ii, jj in self.public.items():
        if not ii in self.public_list and jj[0] + one_day > int(time.time()):
            work = threading.Thread(target=self.public_monitor, args=(ii, 0))
            threading_list.append(work)
    for work in threading_list:
        work.start()
    return have_assign

def one_click_bilibili(self, url: str, times: int, types: int):
    ''' one click: view -> click now -> click web -> report heartbeat '''
    url = self.AV_URL
    if types == 1:
        html = get_request_proxy(url, 0)
    else:
        html = basic_req(url, 0)
    if html == False:
        if times < 5:
            self.basic_view(url, times + 1, types)
        return
    times = 0
    url_1 = self.CLICK_NOW_URL
    if types == 1:
        json_1 = get_request_proxy(url_1, 1)
    else:
        json_1 = basic_req(url_1, 1)
    if json_1 is not None:
        print(json_1)
    if not self.have_error(json_1, 1):
        if times < 2:
            self.one_click_bilibili(url, times + 1, types)
        return
    times = 0
    url = self.CLICK_WEB_URL
    data = {
        'aid': self.basic_av_id,
        'cid': '',
        'part': '1',
        'mid': str(random.randint(10000000, 19999999)),
        'lv': '2',
        'ftime': '',
        'stime': json_1['data']['now'],
        'jsonp': 'jsonp',
        'type': '3',
        'sub_type': '0'
    }
    if types == 1:
        json_req = get_request_proxy(url, 11, data)
    else:
        json_req = basic_req(url, 11, data=data)
    if json_req is not None:
        print(json_req)
    if not self.have_error(json_req):
        if times < 2:
            self.one_click_bilibili(url, times + 1, types)
        return
    times = 0
    url_3 = self.REPORT_HEARTBEAT_URL
    data_3 = {
        'aid': self.basic_av_id,
        'cid': '',
        'mid': data['mid'],
        'csrf': '',
        'played_time': '0',
        'realtime': '0',
        'start_ts': json_1['data']['now'],
        'type': '3',
        'dt': '2',
        'play_type': '1'
    }
    if types == 1:
        json_3 = get_request_proxy(url_3, 11, data_3)
    else:
        json_3 = basic_req(url_3, 11, data=data_3)
    if json_3 is not None:
        print(json_3)
    if not self.have_error(json_3) and times < 2:
        self.one_click_bilibili(url, times + 1, types)
        return
    print('finish.')
    self.finish += 1

def load_rank_index(self, index: int, day_index: int):
    ''' load rank '''
    changeHeaders({'Referer': self.AV_URL})
    url = self.RANKING_URL % (index, day_index)
    text = basic_req(url, 3)
    rank_str = re.findall('window.__INITIAL_STATE__=(.*?);', text)
    if not len(rank_str):
        if can_retry(url):
            return self.load_rank_index(index, day_index)
        return False
    rank_map = json.loads(rank_str[0])
    rank_list = rank_map['rankList']
    now_av_id = []
    wait_check_public = []
    rank_map = {}
    for ii, rank in enumerate(rank_list):
        av_id = int(rank['aid'])
        need_params = ['pts', 'author', 'mid', 'play', 'video_review', 'coins', 'duration', 'title']
        temp_rank_list = [ii, *[rank[key] for key in need_params], index, day_index]
        now_av_id.append(av_id)
        if not self.check_type(av_id):
            continue
        self.check_rank_rose(av_id, temp_rank_list)
        if self.add_av(av_id, ii, temp_rank_list[1]):
            rank_map[av_id] = temp_rank_list
    ''' check assign av rank '''
    for ii in self.assign_ids:
        if not ii in self.public:
            wait_check_public.append(ii)
        if not ii in self.last_view and not ii in self.rank_map:
            self.rank_map[ii] = []
    have_assign = len([0 for ii in self.assign_ids if ii in now_av_id]) > 0
    ''' check tid type '''
    threading_public = []
    for ii in rank_map.keys():
        work = threading.Thread(target=self.check_type_req, args=(ii,))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    for ii, jj in rank_map.items():
        if self.check_type(ii) != True:
            continue
        if not ii in self.public:
            wait_check_public.append(ii)
        self.last_check[ii] = int(time.time())
        self.rank_map[ii] = jj
    ''' load public basic data '''
    threading_public = []
    for ii in wait_check_public:
        work = threading.Thread(target=self.public_data, args=(ii, 0))
        threading_public.append(work)
    for work in threading_public:
        work.start()
    for work in threading_public:
        work.join()
    ''' begin monitor '''
    threading_list = []
    for ii, jj in self.public.items():
        if not ii in self.public_list and jj[0] + one_day > int(time.time()):
            work = threading.Thread(target=self.public_monitor, args=(ii, 0))
            threading_list.append(work)
    for work in threading_list:
        work.start()
    return have_assign

def generate_eleven(self):
    ################################################################
    #
    #   [generate eleven] version 19.4.21 (Test ✔️) written by gunjianpan
    #
    #   1. randomly generate a 15-char param `callback`;
    #   2. use callback to request OCEANBALL -> get the origin js;
    #   3. eval once -> (match the array, then chr() it) -> decoded js;
    #   4. replace document and window (you can also use execjs & jsdom);
    #   5. warning: you should replace `this` with some params,
    #      otherwise you will get `老板给小三买了包, 却没有给你钱买房`
    #      (a decoy string, roughly: 'the boss bought his mistress a
    #      bag, but gave you no money for a house');
    #   6. finish, return, and join the params;
    #
    ################################################################
    callback = self.generate_callback(15)
    now_time = int(time.time() * 1000)
    url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time)
    referer_url = HOTEL_DETAIL_URL % self.default_hotel_id
    changeHeaders({'Referer': referer_url})
    oceanball_js = basic_req(url, 3)
    array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',')
    array = [int(ii) for ii in array]
    offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0])
    ''' String.fromCharCode '''
    oe = ''.join([chr(ii - offset) for ii in array])
    ''' replace the window[callback] callback function '''
    replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0]
    eleven_params = re.findall(r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0]
    replaced_str = 'return {};'.format(eleven_params)
    oe = oe.replace(replace_str, replaced_str)
    oe = oe.replace('\'', '"').replace('\r', '')
    oe = oe.replace(';!', 'let aaa = ', 1)
    replace = '''function(){let href='https://hotels.ctrip.com/hotel/4889292.html';
        a={'documentElement': {'attributes':{}}};
        b={};
        function c(){};
        userAgent ='Chrome/73.0.3682.0';
        geolocation = 0;'''
    ''' replace document & window & navigator '''
    oe = oe.replace('document.body.innerHTML.length', '888').replace('document.body.innerHTML', '""')
    oe = oe.replace('document.createElement("div")', '{}')
    oe = oe.replace('window.HTMLSpanElement', 'c').replace('document.createElement("span")', '1')
    oe = oe.replace('window.location.href', 'href').replace('location.href', 'href')
    oe = oe.replace('navigator.', '')
    oe = oe.replace('new Image().', '')
    oe = oe.replace('document.all', '0').replace('document.referrer', '""')
    oe = oe.replace('this || ', '')
    oe = oe.replace('window["document"]', 'a')
    oe = oe.replace('document', 'a').replace('window', 'b')
    oe = oe.replace('function(){', replace, 1)
    ''' eval the script '''
    eleven = js2py.eval_js(oe)
    echo(1, 'eleven', eleven)
    return eleven

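# Step 3 above (String.fromCharCode with an offset) in isolation; the
# array/offset pair here is fabricated to spell a short string.
offset = 3
array = [ord(c) + offset for c in 'eleven=1;']
decoded = ''.join(chr(ii - offset) for ii in array)
assert decoded == 'eleven=1;'
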
def bulk_import_alimama(self):
    """ bulk import goods into alimama favorites """
    version = begin_time()
    if not os.path.exists('%scollect_wyy' % data_dir):
        print('Collect File not exist!!!')
        return
    with codecs.open('%scollect_wyy' % data_dir, 'r', encoding='utf-8') as f:
        goods = f.readlines()
    self.goods_candidate = [index.split('||')[0] for index in goods]
    goods_len = len(self.goods_candidate)
    self.headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
        'Origin': 'http://pub.alimama.com',
        'Referer': 'http://pub.alimama.com/promo/search/index.htm?q=%E7%AC%AC%E5%9B%9B%E5%8D%81%E4%B9%9D%E5%A4%A9%2019%E6%98%A5%E5%AD%A3&_t=1550891362391'
    }
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/favorites/group/newList.json?toPage=1&perPageSize=40&keyword=&t=',
        str(int(round(time.time() * 1000))),
        '&_tb_token_=', cookie[1][:-1],
        '&pvid=', cookie[2][:-1]
    ]
    url = ''.join(url_list)
    self.headers['Cookie'] = cookie[0][:-1]
    self.headers['Host'] = url.split('/')[2]
    group_list = basic_req(url, 2, header=self.headers)
    if group_list.status_code != 200 or group_list.json()['info']['message'] == 'nologin':
        print('group_list error')
        return
    group_list = group_list.json()['data']['result']
    group_list = [index['id'] for index in group_list]
    print(group_list)
    assert len(group_list) > (goods_len - 1) // 200
    threadings = []
    for index in range((goods_len - 1) // 200 + 1):
        work = threading.Thread(target=self.bulk_import_alimama_once, args=(index, group_list[index]))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)

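# The thread fan-out above hands each worker one 200-id slice of the
# goods list; the same chunking in isolation:
goods = [str(i) for i in range(450)]
batches = [goods[i * 200:(i + 1) * 200] for i in range((len(goods) - 1) // 200 + 1)]
assert [len(b) for b in batches] == [200, 200, 50]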