Esempio n. 1
0
 def update_tpwd(self, mode: int = 0, is_renew: bool = True, a_id: str = None):
     """Refresh the stored tpwd record tuples of the tracked articles.

     Args:
         mode: forwarded to ``generate_tpwd``; when truthy, renew-only
             changes are not counted into the success total.
         is_renew: when True, re-convert the stored URL into a fresh tpwd
             for URLs outside URL_DOMAIN[1]/[2]/[10].
         a_id: when set, restrict the update to this single article id.
     """
     update_num = 0
     for article_id, jj in self.article_list.items():
         if a_id is not None and article_id != a_id:
             continue
         # Record layout (from the destructuring below):
         # (num_iid, title, domain, tpwd, ?, ?, url).
         for o_tpwd, (num_iid, title, domain, tpwd, _, _, url) in jj.items():
             c = jj[o_tpwd]
             if (
                 is_renew
                 and self.URL_DOMAIN[1] not in url
                 and self.URL_DOMAIN[2] not in url
                 and self.URL_DOMAIN[10] not in url
             ):
                 # NOTE(review): operand order is the reverse of the
                 # membership tests just above (`url in URL_DOMAIN[5]` vs
                 # `URL_DOMAIN[n] not in url`); confirm URL_DOMAIN[5] is a
                 # collection of URLs -- otherwise this looks inverted.
                 renew_type = 2 if url in self.URL_DOMAIN[5] else 1
                 origin_tpwd = self.convert2tpwd(url, title)
                 if origin_tpwd is None:
                     # conversion failed -> fall back to the old tpwd
                     origin_tpwd = tpwd
             else:
                 renew_type = 0
                 origin_tpwd = tpwd
             if num_iid == "" or domain == 16:
                 # No item id (or domain 16): store as-is with status 16.
                 c = (
                     *c[:2],
                     16,
                     origin_tpwd,
                     1 if renew_type == 0 else 2,
                     *c[-2:],
                 )
             else:
                 c = self.generate_tpwd(
                     title, int(num_iid), origin_tpwd, renew_type, c, mode
                 )
             self.article_list[article_id][o_tpwd] = c
             # Count successes: status < 15, or a renew while mode == 0.
             update_num += int(c[2] < 15 or (renew_type and not mode))
     echo(2, "Update {} Tpwd Info Success!!".format(update_num))
Esempio n. 2
0
 def update_title(self, article_id: str):
     """Fetch missing item titles for one article, then persist it.

     Rebuilds ``self.tpwd_map[article_id]`` from the stored article rows,
     runs up to five threaded rounds of title fetching, reports how many
     titles now differ from the raw content, and writes the article back
     to the DB.
     """
     rows = self.article_list[article_id].values()
     self.tpwd_map[article_id] = {
         row[3]: {"content": row[1], "item_id": row[0]} for row in rows
     }

     def missing_titles():
         # Keys whose map entry still lacks a fetched title.
         return [
             key
             for key, info in self.tpwd_map[article_id].items()
             if "title" not in info
         ]

     pending = missing_titles()
     attempt = 0
     while pending and attempt < 5:
         futures = [
             self.tpwd_exec.submit(self.get_item_title, article_id, key)
             for key in pending
         ]
         echo(1, article_id, "need get title:", len(futures))
         list(as_completed(futures))
         attempt += 1
         pending = missing_titles()
     update_num = sum(
         1
         for info in self.tpwd_map[article_id].values()
         if "title" in info and info["content"] != info["title"]
     )
     echo(2, "Update", article_id, update_num, "Title Success!!!")
     self.update_article2db(article_id)
Esempio n. 3
0
 def generate_tpwd(
     self, title: str, num_iid: int, renew_tpwd: str, renew_type: int, c: tuple, mode: int
 ):
     """Build an updated tpwd record tuple for one item.

     Queries dg material for (title, num_iid), picks the best share URL,
     converts it into a new tpwd and returns ``c`` with the tpwd and
     commission slots rewritten.  Status codes written to slot 2:
     17 = goods lookup failed, 18 = tpwd conversion failed.

     Args:
         title: item title, used both for search and for the tpwd text.
         num_iid: numeric item id.
         renew_tpwd: fallback tpwd kept when generation fails.
         renew_type: 0 = not renewed, 1/2 = renewed (drives the stored flag).
         c: existing 7-slot record tuple.
         mode: when truthy, keep the real commission_rate even for
             renew_type == 1.

     Returns:
         A tuple shaped like ``c`` with updated tpwd/commission fields.
     """
     goods = self.get_dg_material(title, num_iid)
     if goods is None or not len(goods):
         echo(0, "goods get", 'error' if goods is None else 'empty', ':', title, num_iid)
         return (*c[:2], 17, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
     goods = goods[0]
     # URL preference: ysyl > coupon share > plain url.
     if "ysyl_click_url" in goods and len(goods["ysyl_click_url"]):
         url = goods["ysyl_click_url"]
     elif "coupon_share_url" in goods and len(goods["coupon_share_url"]):
         url = goods["coupon_share_url"]
     else:
         url = goods["url"]
     url = "https:{}".format(url)
     commission_rate = int(goods["commission_rate"])
     commission_type = goods["commission_type"]
     tpwd = self.convert2tpwd(url, title)
     if tpwd is None:
         echo(0, "tpwd error:", tpwd)
         # BUG FIX: was `... else 2 * c[-2:]` -- a lost comma turned the
         # flag into a tuple multiplication and dropped the last two
         # slots; mirror the status-17 branch above.
         return (*c[:2], 18, renew_tpwd, 1 if renew_type == 0 else 2, *c[-2:])
     if mode:
         return (*c[:3], tpwd, commission_rate, commission_type, c[-1])
     if renew_type == 1:
         # Renewed records get the fixed rate flag 2 instead of the rate.
         return (*c[:3], tpwd, 2, commission_type, c[-1])
     return (*c[:3], tpwd, commission_rate, commission_type, c[-1])
Esempio n. 4
0
 def load_article(self, article_id: str, mode: int = 0, is_load2db: bool = True):
     """Decode every tpwd token found in one youdao-note article.

     Flow: extract tpwd strings from the article text (cached per
     article), decode each in up to 5 threaded retry rounds, run a
     second-stage URL decode for entries lacking a type, then fetch
     missing titles (again up to 5 rounds) and optionally persist.

     Args:
         article_id: youdao-note article id.
         mode: when truthy, only refresh share info + DB list and return.
         is_load2db: persist the decoded data via ``load_article2db``.
     """
     if mode:
         self.get_share_info(article_id)
         self.load_list2db()
         return
     # Extract order-preserving, de-duplicated tpwds (dict-keys trick).
     if article_id not in self.tpwds:
         article = self.basic_youdao(article_id)
         tpwds = list({ii: 0 for ii in regex.findall(self.TPWD_REG, article)})
         self.tpwds[article_id] = tpwds
     else:
         tpwds = self.tpwds[article_id]
     if article_id not in self.tpwd_map:
         self.tpwd_map[article_id] = {}
     time = 0  # NOTE: shadows the stdlib `time` module inside this scope
     au_list = []
     # Entries already decoded but still missing a type / item id.
     no_type = [
         ii
         for ii, jj in self.tpwd_map[article_id].items()
         if "type" not in jj or jj["item_id"] is None
     ]
     while (
         len(self.tpwd_map[article_id]) < len(tpwds) or (len(no_type) and not time)
     ) and time < 5:
         thread_list = [ii for ii in tpwds if not ii in self.tpwd_map[article_id]]
         echo(1, article_id, "tpwds len:", len(tpwds), "need load", len(thread_list))
         thread_list = [
             self.tpwd_exec.submit(self.decoder_tpwd_once, article_id, ii)
             for ii in thread_list
         ]
         list(as_completed(thread_list))
         no_type = [
             ii
             for ii, jj in self.tpwd_map[article_id].items()
             if "type" not in jj or jj["item_id"] is None
         ]
         # Second-stage decode via URL for entries still lacking a type.
         au_list.extend(
             [
                 self.tpwd_exec.submit(self.decoder_tpwd_url, article_id, ii)
                 for ii in no_type
             ]
         )
         time += 1
     list(as_completed(au_list))
     # Up to 5 threaded rounds to fill in missing item titles.
     no_title = [
         ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
     ]
     time = 0
     while len(no_title) and time < 5:
         title_list = [
             self.tpwd_exec.submit(self.get_item_title, article_id, ii)
             for ii in no_title
         ]
         echo(1, article_id, "need get title:", len(title_list))
         list(as_completed(title_list))
         time += 1
         no_title = [
             ii for ii, jj in self.tpwd_map[article_id].items() if "title" not in jj
         ]
     if is_load2db:
         self.load_article2db(article_id)
Esempio n. 5
0
    def load_gather(self):
        """Download the gatherproxy subscribe proxy list into ``data_dir``.

        Scrapes the subscribe page for the sid download link; when the
        session cookie has expired the sid is missing, in which case the
        cookie is refreshed and the method calls itself again.
        """
        headers = {
            "Host": "www.gatherproxy.com",
            "Origin": "http://www.gatherproxy.com",
            "Referer": "http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent",
            "Cookie": "_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57",
            "Content-Type": get_content_type(),
            "Accept": get_accept("html"),
        }
        url = "http://www.gatherproxy.com/subscribe/infos"
        try:
            sid_url_req = requests.get(url, headers=headers, verify=False, timeout=10)
        except requests.RequestException:
            # BUG FIX: was a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit; only network errors are
            # expected here, and failure is treated as best-effort.
            return
        sid_url_html = BeautifulSoup(sid_url_req.text, "html.parser")
        sid_url = sid_url_html.find_all("div", class_="wrapper")[1].find_all("a")[0][
            "href"
        ]
        if len(sid_url.split("sid=")) < 2:
            # No sid in the link -> cookie expired; refresh and retry.
            echo("0|warning", "cookie error")
            self.get_cookie()
            self.load_gather()
            return
        sid = sid_url.split("sid=")[1]
        sid_url = "http://www.gatherproxy.com" + sid_url

        data = {"ID": sid, "C": "", "P": "", "T": "", "U": "0"}
        gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
        with codecs.open(data_dir + "gatherproxy", "w", encoding="utf-8") as f:
            f.write(gatherproxy.text)
Esempio n. 6
0
    def comment_check_schedule(self, av_id: int, comment: int):
        """Walk every comment page of one video, then dump the collected
        rows (basics + flattened replies) to a per-video CSV file."""
        last_page = (comment - 1) // 20 + 2
        for pn in range(1, last_page):
            if not self.comment_next[av_id]:
                return
            echo(2, 'Comment check, av_id:', av_id, 'pn:', pn)
            self.check_comment_once(av_id, pn)

        ordered = [self.comment[av_id][key] for key in sorted(self.comment[av_id].keys())]
        basic = [
            ','.join([str(field) for field in item['basic']])
            for item in ordered
            if 'basic' in item
        ]
        replies = []
        for item in ordered:
            if 'replies' not in item:
                continue
            parent_rpid = item['basic'][0]
            for reply in item['replies']:
                # Prefix each reply id with its parent comment id.
                reply[0] = '%s-%s' % (str(parent_rpid), str(reply[0]))
                replies.append(','.join([str(field) for field in reply]))
        out_path = '%s%d_comment.csv' % (comment_dir, av_id)
        with codecs.open(out_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(basic) + '\n')
            f.write('\n'.join(replies) + '\n')
Esempio n. 7
0
    def check_comment_once(self,
                           av_id: str,
                           pn: int,
                           sort: int,
                           root: int = -1,
                           ps: int = 10):
        """Fetch one page of comments (or sub-replies) and collect details.

        Args:
            av_id: video id.
            pn: page number to fetch.
            sort: API sort mode, passed through to ``get_comment_info``.
            root: parent comment id when paging sub-replies; -1 means
                top-level comments.
            ps: page size used when paging sub-replies.
        """
        comment = self.get_comment_info(av_id, pn, sort, root, ps)
        if comment is None:
            return
        if root != -1:
            echo(
                "2|debug",
                "Comment check, av_id:",
                av_id,
                "pn:",
                pn,
                "sort:",
                sort,
                "root:",
                root,
                "ps:",
                ps,
            )
        else:
            echo("2|debug", "Comment check, av_id:", av_id, "pn:", pn, "sort:",
                 sort)
        hots = comment["hots"]
        replies = comment["replies"]
        # Only the first top-level page carries the "hots" block.
        if pn > 1 or root != -1:
            wait_check = replies
        else:
            wait_check = replies if hots is None else [*hots, *replies]
        # Tag every entry with a 1-based index ("reply-N" on sub-pages).
        if root == -1:
            wait_check = [{
                **jj, "idx": ii + 1
            } for ii, jj in enumerate(wait_check)]
        else:
            wait_check = [{
                **jj, "idx": "reply-{}".format(ii + 1)
            } for ii, jj in enumerate(wait_check)]

        for ii in wait_check:
            info = {"basic": self.get_comment_detail(ii, av_id, pn, sort)}
            rpid = info["basic"][0]
            crep = ii["replies"]
            idx = ii["idx"]

            # Inline sub-replies get detail rows labelled "<idx>-<w>".
            if not crep is None:
                info["replies"] = [
                    self.get_comment_detail(
                        {
                            **kk, "idx": "{}-{}".format(idx, ww + 1)
                        },
                        av_id,
                        pn,
                        sort,
                        rpid,
                    ) for ww, kk in enumerate(crep)
                ]
            # NOTE(review): `info` is built but never stored or returned
            # in this snippet -- confirm nothing was truncated here.
Esempio n. 8
0
 def js_compile_sn(self, prepare_map):
     """Evaluate the prepared JS to compute the ``_sn`` signature."""
     script = '<script>{}</script>'.format(self.result_js)
     sn = self.js_compile.call('analysis_js', script, self.slat, prepare_map)
     echo(2, '_sn', sn)
     return sn
Esempio n. 9
0
    def public_monitor(self, bv_id: str):
        """Sample follower count and video stats now and again one day
        after the video's creation, then append both snapshots to
        ``public.csv``.

        Args:
            bv_id: video id; its (created, mid) pair must already be in
                ``self.public["T"]``.
        """
        self.public["L"].append(bv_id)
        created, mid = self.public["T"][bv_id]
        self.get_star_num(mid)
        self.check_rank_v2(bv_id)
        time.sleep(5)
        follower = self.star["T"][mid] if mid in self.star["T"] else 0
        data1 = self.data_v2[bv_id] if bv_id in self.data_v2 else {}
        sleep_time = created + one_day - int(time_stamp())
        if sleep_time < 0:
            # Already more than a day old -- nothing left to monitor.
            return
        echo("4|debug", "Monitor Begin %s" % (bv_id))
        time.sleep(sleep_time)
        self.get_star_num(mid)
        self.check_rank_v2(bv_id)
        time.sleep(5)
        follower_2 = self.star["T"][mid] if mid in self.star["T"] else 0
        # BUG FIX: defaulted to [] while data1 defaulted to {}; the
        # `.values()` call below requires a mapping in both cases.
        data2 = self.data_v2[bv_id] if bv_id in self.data_v2 else {}

        data = [
            time_str(created),
            bv_id,
            follower,
            follower_2,
            # BUG FIX: was `data.values()`, referencing the very list
            # being built (UnboundLocalError); the first snapshot is data1.
            *list(data1.values()),
            *list(data2.values()),
        ]
        with codecs.open(data_dir + "public.csv", "a", encoding="utf-8") as f:
            f.write(",".join([str(ii) for ii in data]) + "\n")
Esempio n. 10
0
    def prepare_js(self):
        """Fetch and de-obfuscate the hotel ``index.js``, extract the salt,
        and compile the local decoder script for ``js_compile_sn``."""
        pre_text = basic_req(self.JD_URL, 3)
        INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t',
                                  pre_text)[0].split('"')[1]
        origin_js = basic_req(INDEX_JS_URL, 3)
        # Decode the \uXXXX-escaped source.
        decode_js = codecs.unicode_escape_decode(origin_js)[0]
        # Replace the obfuscation lookup table `rp[i]` with its literals.
        replace_list_str = decode_js.split(';')[2]
        empty_index = replace_list_str.index(' ') + 1
        begin_index = replace_list_str.index('=[') + 2
        end_index = replace_list_str.index(']')
        replace_list = replace_list_str[begin_index:end_index].split(',')
        rp = replace_list_str[empty_index:begin_index - 2]
        for ii, jj in enumerate(replace_list):
            decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
        # Entry 46 of the table is the salt -- presumably stable across
        # builds; verify if upstream changes.
        self.slat = replace_list[46].replace('"', '')
        echo(2, 'salt', self.slat)
        # Dump the readable source locally for debugging.
        with open(decoder_js_path, 'w') as f:
            f.write(';\n'.join(decode_js.split(';')))
        # Strip the ajaxPrefilter block so the script can run headless.
        del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
        del_begin_index = decode_js.index(del_str[0])

        # BUG FIX: this statement was duplicated verbatim; compute once.
        result_js = decode_js[:del_begin_index] + \
            decode_js[del_begin_index + len(del_str[0]):]
        self.result_js = result_js
        self.js_compile = execjs.compile(open(hotel_js_path).read())
        echo(1, 'Load hotel index js success!!!')
Esempio n. 11
0
    def load_spot_once(self, pn=1, city_id=10186):
        """Load one page of spot names for a city via the ajax router and
        accumulate them into ``self.spot_result``.

        Args:
            pn: 1-based page number to request.
            city_id: mdd city id to query.
        """
        data = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': city_id,
            'iTagId': 0,
            'iPage': pn,
        }
        data = self.load_sn(data)
        # BUG FIX: removed leftover debug `print(data)` -- this module
        # logs through echo(), and the print polluted stdout on every page.
        req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data)
        if req is None or 'data' not in req or 'list' not in req['data']:
            # Transient failure -> retry while the retry budget allows.
            if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
                self.load_spot_once(pn, city_id)
            return
        spot_list = req['data']['list']
        spot_pn = req['data']['page']
        spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
        try:
            total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
        except Exception as e:
            # Pagination markup missing or changed -> assume one page.
            total_pn = 1
            echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)

        if city_id not in self.spot_result:
            self.spot_result[city_id] = spot_tmp
        else:
            self.spot_result[city_id] += spot_tmp
        self.spot_pn[city_id] = total_pn
Esempio n. 12
0
def req_ip66():
    """Request 66ip.cn using a freshly generated JS-challenge cookie."""
    header['Cookie'] = generate_cookie()
    resp_text = basic_req(IP66_URL, 3, header=header)
    echo(2, resp_text)
    return resp_text
Esempio n. 13
0
    def initproxy(self):
        """Initialize the four proxy pools from rows stored in the DB."""
        rows = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not rows:
            echo(0,
                 'Please check db configure!!! The proxy pool cant use!!!>>>')
            return
        for row in rows:
            address, proxy_type = row[0], row[1]
            if proxy_type == 1:
                self.proxylists.append(address)
            elif proxy_type == 2:
                self.proxylist.append(address)
                self.proxylist_ss.append(address)
            elif proxy_type == 3:
                self.proxylists.append(address)
                self.proxylists_ss.append(address)
            else:
                self.proxylist.append(address)
        echo(2, len(self.proxylist), ' http proxy can use.')
        echo(2, len(self.proxylists), ' https proxy can use.')
        echo(2, len(self.proxylist_ss), ' ss http proxy can use.')
        echo(2, len(self.proxylists_ss), ' ss https proxy can use.')
Esempio n. 14
0
    def db_can_use_proxy(self):
        """Sync ``self.can_use_ip`` into the DB -- insert unseen proxies,
        re-activate known-but-disabled ones -- then reset the buffer."""

        results = self.select_proxy([ii[0] for ii in self.can_use_ip.values()])
        ss_len = len([1 for ii in self.can_use_ip.values() if ii[1] > 1])
        echo("2|info", "SS proxies", ss_len)

        insert_list = []
        update_list = []
        ip_map = {}
        # BUG FIX (idiom): was `results != False`; a DB failure is
        # signalled with the literal False, so test identity.
        if results is not False:
            for ip_info in results:
                # Presumably (row_id, address, disabled_flag) -- verify
                # against select_proxy's schema.
                ip_map[ip_info[1]] = [ip_info[0], ip_info[2]]

            for ip_now in self.can_use_ip.values():
                http_type = ip_now[1]
                ip_now = ip_now[0]
                if ip_now in ip_map:
                    if ip_map[ip_now][1]:
                        # Known but flagged -> schedule re-activation.
                        update_list.append(
                            (ip_map[ip_now][0], ip_now, http_type, 0))
                else:
                    insert_list.append((ip_now, http_type))
            if len(insert_list):
                self.insert_proxy(insert_list)
            if len(update_list):
                self.update_proxy(update_list, 0)
        self.can_use_ip = {}
Esempio n. 15
0
 def update_article(self, article_id: str, article_body: str):
     """Push an edited body for an existing youdao-note shared article.

     Args:
         article_id: id of the shared article to update.
         article_body: new body string for the note.

     Returns:
         True when the sync API acknowledges the push, else False.
     """
     p = self.share2article[article_id][-2].split("/")[-1]
     article_info = self.list_recent[p]
     data = {
         "fileId": p,
         "parentId": article_info["parentId"],
         "domain": article_info["domain"],
         "rootVersion": -1,
         "sessionId": "",
         "modifyTime": int(time_stamp()),
         "bodyString": article_body,
         "transactionId": p,
         "transactionTime": int(time_stamp()),
         "orgEditorType": article_info["orgEditorType"],
         "tags": article_info["tags"],
         "cstk": self.cstk,
     }
     url = self.SYNC_URL % ("push", self.cstk)
     req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
     if req is None or list(req.keys()) != [
         "entry",
         "meta",
         "effectedShareEntries",
         "forcePullVersion",
         "effected",
     ]:
         echo(
             "0|error",
             # BUG FIX: message said "atricle_id"; also `req` is already
             # the decoded dict (see `.keys()` above), so calling the
             # Response method `.json()` on it would raise.
             "Update article_id {} Error".format(article_id),
             req if req is not None else "",
         )
         return False
     echo("1|warning", "Update article_id {} Success!!!".format(article_id))
     return True
Esempio n. 16
0
    def get_check(self):
        """Refresh the video list and fan out comment checks.

        Detects newly released videos (spawning bsocket monitor processes
        and a notification mail for each), then -- outside the configured
        quiet window -- runs ``comment_check_schedule`` for every video in
        parallel threads and persists the comment store.

        Returns:
            The (av_id, comment_count) list, or None when skipped.
        """
        self.load_av_lists()
        av_id_list = [[ii['aid'], ii['comment']]
                      for ii in self.av_id_map.values()
                      if not re.findall(self.ignore_list, str(ii['aid']))]
        av_map = {ii['aid']: ii for ii in self.av_id_map.values()}
        self.comment_next = {ii: True for (ii, _) in av_id_list}
        # A changed list length means new releases since the last poll.
        if self.av_id_list and len(
                self.av_id_list) and len(self.av_id_list) != len(av_id_list):
            new_av_id = [
                ii for (ii, _) in av_id_list
                if not ii in self.av_id_list and not ii in self.del_map
            ]
            self.rank_map = {**self.rank_map, **{ii: [] for ii in new_av_id}}
            echo(1, new_av_id)
            for ii in new_av_id:
                # `%d` intentionally survives .format() and is filled by
                # the os.system calls below (two monitor variants).
                shell_str = 'nohup ipython3 bilibili/bsocket.py {} %d >> log.txt 2>&1 &'.format(
                    ii)
                echo(0, shell_str)
                os.system(shell_str % 1)
                os.system(shell_str % 2)
                email_str = '{} av:{} was releasing at {}!!! Please check the auto pipeline.'.format(
                    av_map[ii]['title'], ii, time_str(av_map[ii]['created']))
                email_str2 = '{} {} is release at {}.\nPlease check the online & common program.\n\nBest wish for you\n--------\nSend from script by gunjianpan.'.format(
                    av_map[ii]['title'], time_str(av_map[ii]['created']),
                    self.BASIC_AV_URL % ii)
                send_email(email_str2, email_str)
                self.update_ini(ii)
                self.public[ii] = [av_map[ii]['created'], av_map[ii]['mid']]

        self.av_id_list = [ii for (ii, _) in av_id_list]
        # Skip comment checking inside the configured quiet window.
        now_hour = int(time_str(time_format='%H'))
        now_min = int(time_str(time_format='%M'))
        now_time = now_hour + now_min / 60
        if now_time > self.ignore_start and now_time < self.ignore_end:
            return
        if os.path.exists('{}comment.pkl'.format(comment_dir)):
            with codecs.open('{}comment.pkl'.format(comment_dir), 'rb') as f:
                self.comment = pickle.load(f)
        if self.assign_up_mid == -1:
            return

        # One thread per video; join them all before persisting.
        threading_list = []
        for (ii, jj) in av_id_list:
            if ii not in self.comment:
                self.comment[ii] = {}
            work = threading.Thread(target=self.comment_check_schedule,
                                    args=(
                                        ii,
                                        jj,
                                    ))
            threading_list.append(work)
        for work in threading_list:
            work.start()
        for work in threading_list:
            work.join()
        with codecs.open('{}comment.pkl'.format(comment_dir), 'wb') as f:
            pickle.dump(self.comment, f)
        return av_id_list
Esempio n. 17
0
 def get_search_list(self, q: str):
     """Search douban for ``q`` and merge id->title results into
     ``self.movie_id2name``; retries (with backoff) on failures and
     disables the proxy endpoint when it is rate-limited."""
     if self.proxy_can_use:
         use_proxy = random.random() * 10 > 7
         base_url = self.API_PROXY_URL if use_proxy else self.API_BASIC_URL
     else:
         base_url = self.API_BASIC_URL
     url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
     search_json = proxy_req(url, 1)
     if search_json is None or 'subjects' not in search_json:
         if search_json and 'code' in search_json:
             if search_json['code'] == 112:
                 # Rate-limited -> stop using the proxy endpoint.
                 self.proxy_can_use = False
         if can_retry(url, 6):
             time.sleep(random.random() *
                        (3.14 + random.randint(4, 10)) + 3.14)
             self.get_search_list(q)
         else:
             self.again_list.append(q)
             echo(0, url, 'Failed')
         return
     id2name = {
         int(subject['id']): subject['title']
         for subject in search_json['subjects']
     }
     self.movie_id2name = {**self.movie_id2name, **id2name}
     self.finish_list.append(q)
     if not len(self.finish_list) % 600:
         echo(2, len(self.finish_list), 'Finish...')
         dump_bigger(self.movie_id2name,
                     '{}douban_movie_id.pkl'.format(data_dir))
Esempio n. 18
0
 def history_rank(self, time_gap: int, now_info: list, av_id: int):
     """Email a year-to-date ranking report for one video snapshot.

     Ranks ``now_info`` (the current stats of ``av_id``) against stored
     snapshots of other videos taken at the same time-gap bucket, builds
     a summary mail (template fields suggest index 1 = views, 2 = likes,
     3 = coins, 4 = favorites, 7 = danmaku), sends it, and marks the
     bucket as handled.

     Args:
         time_gap: seconds since release; rounded onto a 10s bucket grid.
         now_info: stat row for the current video.
         av_id: id of the video being reported.
     """
     echo(0, 'send history rank')
     # Bucket the gap onto the 10-second grid used as history_map key.
     time_gap = round(time_gap / 10) * 10
     # Snapshots of other videos that have a non-zero view count.
     history_map = {
         ii: jj
         for ii, jj in self.history_map[time_gap].items() if jj[1]
     }
     other_views = [int(ii[1]) for ii in history_map.values()]
     other_views_len = len(other_views)
     other_views.append(now_info[1])
     # Descending view order; our video sits at index other_views_len.
     ov_sort_idx = np.argsort(-np.array(other_views))
     av_ids = list(history_map.keys())
     now_sorted = [
         jj for jj, ii in enumerate(ov_sort_idx) if ii == other_views_len
     ][0] + 1
     # Top-4 (rank, av_id) pairs excluding the current video itself.
     other_result = [(jj + 1, av_ids[ii])
                     for jj, ii in enumerate(ov_sort_idx[:4])
                     if ii != other_views_len]
     time_tt = self.get_time_str(time_gap)
     email_title = 'av{}发布{}, 本年度排名No.{}/{}, 播放量: {}, 点赞: {}, 硬币: {}, 收藏: {}, 弹幕: {}'.format(
         av_id, time_tt, now_sorted, len(other_views), now_info[1],
         now_info[2], now_info[3], now_info[4], now_info[7])
     email_title += self.get_history_rank(now_info)
     context = '{}\n\n'.format(email_title)
     # Body lists the top three competitors with the same stat template.
     for no, av in other_result[:3]:
         data_info = history_map[av]
         context += '{}, av{}, 本年度No.{}, 播放量: {}, 点赞: {}, 硬币: {}, 收藏: {}, 弹幕: {}{}, 发布时间: {}\n'.format(
             self.av_id_map[av]['title'].split('|',
                                               1)[0], av, no, data_info[1],
             data_info[2], data_info[3], data_info[4], data_info[7],
             self.get_history_rank(data_info),
             time_str(self.av_id_map[av]['created']))
     context += '\nBest wish for you\n--------\nSend from script by gunjianpan.'
     send_email(context, email_title)
     self.history_check_finish.append(round(time_gap / 10))
Esempio n. 19
0
 def load_comment_v2(self, movie_id: int, start: int):
     """Load one page of douban comments via the proxy endpoint and merge
     them into ``self.comment``; progress lands in ``self.finish_list``.

     Args:
         movie_id: douban movie id.
         start: result offset of the requested page.
     """
     url = self.COMMENT_PROXY_URL % (movie_id, start)
     self.generate_cookie()
     comment_json = basic_req(url, 1)
     if comment_json is None or 'comments' not in comment_json:
         if comment_json is not None and 'code' in comment_json:
             if comment_json['code'] == 5000:
                 # 5000 -> no data at this offset: mark as finished.
                 self.finish_list[(movie_id, start)] = 0
                 self.checkpoint()
             else:
                 # BUG FIX: `comment_json['code'] == 112` was a bare,
                 # effect-free comparison; it was clearly meant to guard
                 # the proxy shut-off (cf. the 112 check in
                 # get_search_list).
                 if comment_json['code'] == 112:
                     self.proxy_can_use = False
                 echo(2, url, 'Failed')
                 self.again_list.append([movie_id, start])
         else:
             self.again_list.append([movie_id, start])
             echo(0, url, 'Failed')
         return
     comment_html = comment_json['comments']
     comment = {
         (movie_id, ii['author']['id']): [
             ii['author']['name'], ii['author']['id'],
             ii['created_at'], ii['content'], '', ii['rating']['value']
         ] for ii in comment_html
     }
     user_list = {ii['author']['id'] for ii in comment_html}
     self.user_info = {*self.user_info, *user_list}
     self.comment = {**self.comment, **comment}
     if len(user_list) == 100:
         # A full batch of 100 distinct users -> presumably more pages;
         # queue the next user batch.
         self.more_user.append([movie_id, start + 100])
     # This request covers five 20-comment sub-pages; mark them all done.
     for offset in range(0, 100, 20):
         self.finish_list[(movie_id, start + offset)] = 0
     self.checkpoint()
Esempio n. 20
0
 def load_proxies_list(self, types: int = 2):
     """Collect candidate proxies from the list sites plus gatherproxy and
     fill ``self.waitjudge`` with scheme-prefixed candidates.

     Args:
         types: 0 -> http only, 1 -> https only, anything else -> both.
     """
     SITES = [
         "http://www.proxyserverlist24.top/", "http://www.live-socks.net/"
     ]
     # FIX: removed the unused local `spider_pool`.
     self.waitjudge = []
     for site in SITES:
         self.get_other_proxies(site)
     self.gatherproxy(3)
     # De-duplicate before fanning out into per-scheme variants.
     waitjudge = list(set(self.waitjudge))
     waitjudge_http = ["http://" + ii for ii in waitjudge]
     waitjudge_https = ["https://" + ii for ii in waitjudge]
     if not types:
         self.waitjudge = waitjudge_http
     elif types == 1:
         self.waitjudge = waitjudge_https
     else:
         self.waitjudge = waitjudge_http + waitjudge_https
     echo(
         "1|info",
         "-_-_-_-_-_-_-",
         len(waitjudge),
         "Proxies wait to judge -_-_-_-_-_-_-",
     )
Esempio n. 21
0
def load_index():
    """Crawl the dytt8 homepage, fan out over all "more" listing pages,
    and dump the de-duplicated movie-title list to ``dytt8_result.txt``."""
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    # BUG FIX: proxy_req can return None on failure (see the None checks
    # on its result elsewhere in this project); `len(None)` raised
    # TypeError before the retry could ever run.
    if text is None or not len(text):
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)

    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_another]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_again]
    shuffle_batch_run_thread(threading_list, 100)
    # De-duplicate the collected movie titles.
    movie_list = set(movie_list)
    # Export the crawled movie list.
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))
    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
            url_num, movie_num, out_path, end_time(version, 0)))
Esempio n. 22
0
    def init_proxy(self):
        """Initialize the four proxy pools from DB rows."""

        rows = self.Db.select_db(self.select_list)
        self.proxylist = []
        self.proxylists = []
        self.proxylist_ss = []
        self.proxylists_ss = []
        if not rows:
            echo("0|error",
                 "Please check db configure!!! The proxy pool cant use!!!>>>")
            return
        for row in rows:
            addr, kind = row[0], row[1]
            if kind == 1:
                self.proxylists.append(addr)
            elif kind == 2:
                self.proxylist.append(addr)
                self.proxylist_ss.append(addr)
            elif kind == 3:
                self.proxylists.append(addr)
                self.proxylists_ss.append(addr)
            else:
                self.proxylist.append(addr)
        echo("2|info", len(self.proxylist), " http proxy can use.")
        echo("2|info", len(self.proxylists), " https proxy can use.")
        echo("2|info", len(self.proxylist_ss), " ss http proxy can use.")
        echo("2|info", len(self.proxylists_ss), " ss https proxy can use.")
Esempio n. 23
0
def generate_cookie():
    """Defeat the 66ip.cn JS cookie challenge (eval trick, validated 19.5.7).

    Returns:
        The full cookie string including ``__jsl_clearance``; retries
        itself when the challenge evaluation fails.
    """
    req = basic_req(IP66_URL, 2, header=header)
    basic_cookie = req.cookies.get_dict()

    # !important: \b in py -> \x80, keep the raw text untouched.
    req_text = r'{}'.format(req.text)

    # Extract the script the page would eval.
    script_text = re.findall('<script>(.*?)</script>', req_text)[0]
    script_text = script_text.replace(
        '{eval(', '{aaa=').replace(');break', ';break')
    script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa'))
    echo(0, script_eval)

    try:
        # Replace document & window shims before eval'ing.
        params = re.findall(
            r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)', script_eval)
        wait_eval = params[0][1].replace(
            "document.createElement('div')", "{}").replace("", '')
        wait_replace = re.findall(
            r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0]
        wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";')

        # Eval & assemble the final cookie.
        other_param = js2py.eval_js(
            'function ddd() {window={};' + wait_eval + '}ddd()')
        cookie = '{}; {}{}'.format(encoder_cookie(
            basic_cookie), params[0][0], other_param)
        echo(1, 'cookie', cookie)

        return cookie
    except Exception:
        # BUG FIX: the retry's result was discarded (implicit None return)
        # and the bare `except:` also trapped KeyboardInterrupt.
        return generate_cookie()
Esempio n. 24
0
 def tostring(self, e: int):
     """Serialize this big number (word array ``self.E``) in base ``e``.

     ``e`` appears to be assumed a power of two (digits are extracted by
     bit-shifting with a ``(1 << t) - 1`` mask); output characters come
     from the ``g`` alphabet table.  Returns '-' when the sign field is
     negative and '0' for zero.  Looks ported from the JS BigInteger
     ``toString`` -- TODO confirm.
     """
     if self.s < 0:
         echo('0|warning', '.s < 0', self.s)
         return '-'
     t = int(np.log2(e))  # bits per output digit
     r, o, i, a = (1 << t) - 1, False, '', self.t
     # s = bit offset into the top word; DB is presumably the word size
     # in bits -- verify against the enclosing class.
     s = DB - a * DB % t
     if a > 0:
         if s < DB:
             n = self.E[a] >> s
             if n > 0:
                 o = True   # first non-zero digit seen
                 i = g[n]
         a -= 1
         while a >= 0:
             if s < t:
                 # Digit straddles a word boundary: combine the low bits
                 # of the current word with the high bits of the next.
                 n = (self.E[a] & (1 << s) - 1) << t - s
                 a -= 1
                 s += DB - t
                 n = n | (self.E[a] >> s)
             else:
                 s -= t
                 n = self.E[a] >> s & r
                 if s <= 0:
                     s += DB
                     a -= 1
             if n > 0:
                 o = True
             if o:
                 i += g[n]  # emit only after the first non-zero digit
     return i if o else '0'
Esempio n. 25
0
 def get_s_click_url(self, s_click_url: str):
     """Resolve an s.click short link into its item detail.

     Validated working as of 2019.10.23; returns None when the redirect
     location cannot be resolved.
     """
     # Random delay before the request (presumably to avoid rate limits).
     time.sleep(np.random.randint(0, 10))
     location = self.get_s_click_location(s_click_url)
     if location is not None:
         return self.get_item_detail(location)
     echo(3, "s_click_url location Error..")
Esempio n. 26
0
 def get_download(self, types: str):
     """Fetch proxies of the given type from the proxy-list.download v0
     API; returns a list of "ip:port" strings (empty on failure)."""
     url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
     resp = basic_req(url, 1)
     if resp is None:
         return []
     entries = resp[0]["LISTA"]
     echo(1, "Get download", types, len(entries))
     return ["{}:{}".format(entry["IP"], entry["PORT"]) for entry in entries]
Esempio n. 27
0
 def _getroom_id(self, proxy: bool = True):
     """ get av room id """
     cid = self.get_cid(self._bv_id)
     assert (
         cid and len(cid) >= self._p
     ), "Actual Page len: {} <=> Need Pages Num: {}".format(len(cid), self._p)
     self._room_id = int(cid[self._p - 1])
     echo(3, "Room_id:", self._room_id)
Esempio n. 28
0
 def load_picture_pipeline(self, file_path: str):
     """Queue downloads for every decoded tpwd picture of one article.

     Skips pictures already saved under ``picture/``; returns the list of
     submitted futures.
     """
     mkdir('picture')
     tpk_list = self.tpwds[file_path]
     tpwd_map = self.tpwd_map[file_path]
     candidates = [
         (tpwd_map[tpk]['picUrl'], idx)
         for idx, tpk in enumerate(tpk_list)
         if tpk in tpwd_map
     ]
     pending = [
         (pic_url, idx)
         for pic_url, idx in candidates
         if not os.path.exists('picture/{}.jpg'.format(idx))
     ]
     echo(1, 'Load {} picture Begin'.format(len(pending)))
     return [
         self.tpwd_exec.submit(self.load_picture, pic_url, idx)
         for pic_url, idx in pending
     ]
Esempio n. 29
0
 def checkpoint(self):
     """Periodically persist crawl state to ``data_dir`` pickles.

     Dumps every 32 finished entries while the proxy endpoint is usable,
     else every 200.
     """
     checkpoint_num = 32 if self.proxy_can_use else 200
     # Idiom fix: len(d) instead of len(d.keys()) -- same value without
     # the intermediate view object.
     if not len(self.finish_list) % checkpoint_num:
         echo(2, len(self.finish_list), 'Finish...')
         dump_bigger(self.user_info, '{}douban_user.pkl'.format(data_dir))
         dump_bigger(self.finish_list, '{}douban_cf.pkl'.format(data_dir))
         dump_bigger(self.more_user, '{}douban_more.pkl'.format(data_dir))
         dump_bigger(self.again_list, '{}douban_again.pkl'.format(data_dir))
Esempio n. 30
0
 def get_free_proxy(self, url: str):
     """Scrape "ip:port" proxies from a free-proxy listing page.

     Returns an empty list when the request fails.
     """
     resp = basic_req(url, 2)
     if resp is None:
         return []
     pattern = "<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>"
     pairs = re.findall(pattern, resp.text)
     echo(1, "Get Free proxy List", url, len(pairs))
     return ["{}:{}".format(ip, port) for ip, port in pairs]