Example #1
def crawler_page_html(page_url, retry=True):

    # raise gen.Return( open("test.html", "rb").read() )   # DEBUG
    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding":
            "gzip, deflate",
            "accept-language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control":
            "max-age=0",
            "upgrade-insecure-requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        },
        "proxy_host": "192.168.206.1",
        "proxy_port": 1080,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)

    if response.code != 200:
        raise gen.Return("")

    raise gen.Return(response.body)
Example #2
def crawler_page_html(page_url, retry=True):

    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding":
            "gzip, deflate",
            "accept-language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control":
            "max-age=0",
            "upgrade-insecure-requests":
            "1",
            "user-agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "proxy_host": None,
        "proxy_port": None,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)

    if response.code != 200:
        # raise Exception("http status code %s,%s" % (response.code, response.error))
        raise gen.Return("")

    raise gen.Return(response.body)
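# A minimal sketch of how these coroutine-style crawlers are presumably
# driven (assumptions: crawler_page_html is decorated with @gen.coroutine
# and tool.http_request wraps tornado's AsyncHTTPClient):
from tornado import gen, ioloop

@gen.coroutine
def demo():
    html = yield crawler_page_html("http://example.com/")  # hypothetical URL
    print len(html)

ioloop.IOLoop.current().run_sync(demo)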
Example #3
def get_post_page_head(post_url, postfix_list):
    post_page_return_code, post_page_data = tool.http_request(post_url)[:2]
    # If the URL without a postfix is accessible, use that page directly;
    # otherwise try each postfixed URL in turn
    if post_page_return_code != 1:
        for postfix in postfix_list:
            temp_post_url = post_url + "/" + urllib2.quote(postfix)
            post_page_return_code, post_page_data = tool.http_request(temp_post_url)[:2]
            if post_page_return_code == 1:
                break
    if post_page_data is not None:
        return tool.find_sub_string(post_page_data, "<head", "</head>", 3)
    else:
        return None
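# tool.find_sub_string is used heavily throughout these examples; a hedged
# sketch of its presumable behavior (an assumption, not the project's actual
# code; the mode semantics in particular are a guess):
def find_sub_string(haystack, start_string, end_string=None, mode=1):
    # guessed modes: 1 -> text between the delimiters, 3 -> include both
    start = haystack.find(start_string)
    if start == -1:
        return ""
    inner_start = start + len(start_string)
    if end_string is None:
        return haystack[inner_start:]
    end = haystack.find(end_string, inner_start)
    if end == -1:
        return ""
    if mode == 3:
        return haystack[start: end + len(end_string)]
    return haystack[inner_start: end]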
Example #4
    def construct_request(self):

        yield self.set_new_proxy()

        if self.proxy_item:
            logging.debug("Forward request via upstream proxy %s" % self.proxy_item)

            # Changing the `X-Forwarded-For` (hope it works for some cases)
            if self.proxy_item["anoy"] != True:
                self.request.headers["Via"] = "NaN"
                self.request.headers["X-Forwarded-For"] = self.proxy_item["proxy_host"]

            raise tornado.gen.Return( tool.http_request({
                "url": self.request.uri,
                "method": self.request.method,
                "headers": self.request.headers,
                "body": self.request.body or None,
                "proxy_host": self.proxy_item["proxy_host"],
                "proxy_port": self.proxy_item["proxy_port"],
                "request_timeout": 15,
                "follow_redirects": False,
                "allow_nonstandard_methods": True
            }) )
        else:
            logging.error("Proxy server error: No available proxy.")

            self.set_status(self.ERROR_STATUS_CODE)
            self.finish("Proxy server error:\n No available proxy.")

            raise tornado.gen.Return(None)
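    # A hedged sketch of how construct_request might be consumed in the same
    # tornado.web.RequestHandler subclass (this relaying logic is an
    # assumption, not the project's actual code):
    @tornado.gen.coroutine
    def get(self):
        response = yield self.construct_request()
        if response is not None:  # None means the handler already responded
            self.set_status(response.code)
            self.finish(response.body or "")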
Example #5
def get_one_page_post_url_list(account_id, page_count):
    # http://moexia.lofter.com/?page=1
    index_page_url = "http://%s.lofter.com/?page=%s" % (account_id, page_count)
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        return re.findall('"(http://' + account_id + '.lofter.com/post/[^"]*)"', index_page)
    return None
Example #6
def check_invalid():
    # Get the save-data path
    config_path = os.path.join(os.getcwd(), "..\\common\\config.ini")
    config = robot.read_config(config_path)
    save_data_path = robot.get_config(config, "SAVE_DATA_PATH", "info/save.data", 3)
    save_data_dir = os.path.dirname(save_data_path)
    fee_save_data_path = os.path.join(save_data_dir, "fee.data")
    # Read the fee (paid) album ID list from the archive
    if not os.path.exists(fee_save_data_path):
        log.step("收费相册存档不存在")
        return
    fee_save_data_file = open(fee_save_data_path, "r")
    fee_save_data = fee_save_data_file.read()
    fee_save_data_file.close()
    fee_album_id_list = fee_save_data.strip().split(" ")
    new_fee_album_id_list = []
    # Visit each album to check whether it has been deleted
    for fee_album_id in fee_album_id_list:
        album_url = "http://meituzz.com/album/browse?albumID=%s" % fee_album_id
        album_page_return_code, album_page = tool.http_request(album_url)[:2]
        if album_page_return_code == 1:
            if album_page.find("<title>相册已被删除</title>") == -1:
                new_fee_album_id_list.append(fee_album_id)
            else:
                log.step("第%s页相册已被删除" % fee_album_id)
    # Save the updated list
    fee_save_data_file = open(fee_save_data_path, "w")
    fee_save_data_file.write(" ".join(new_fee_album_id_list) + " ")
    fee_save_data_file.close()
Example #7
def get_picasaweb_page_album_id(account_id, picasaweb_url):
    message_page_return_code, message_page = tool.http_request(picasaweb_url)[:2]
    if message_page_return_code == 1:
        # Find the album id on the picasaweb page
        album_archive_url = "https://get.google.com/albumarchive/pwa/%s/album/" % account_id
        return tool.find_sub_string(message_page, 'href="%s' % album_archive_url, '"')
    return None
Example #8
def get_one_page_album(account_id, token):
    index_url = "https://plus.google.com/_/photos/pc/read/"
    post_data = 'f.req=[["posts",null,null,"synthetic:posts:%s",3,"%s",null],[%s,1,null],"%s",null,null,null,null,null,null,null,2]' % (account_id, account_id, GET_IMAGE_URL_COUNT, token)
    index_page_return_code, index_page = tool.http_request(index_url, post_data)[:2]
    if index_page_return_code == 1:
        return index_page
    return None
Example #9
def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        index_url = "http://www.akb48.co.jp/about/members/?team_id=" + str(team_id)
        return_code, page = tool.http_request(index_url)[:2]
        if return_code == 1:
            member_list_page = tool.find_sub_string(page, '<ul class="memberListUl">', '</ul>')
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', '</h4>')
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', '</p>')
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        print "error japanese_name"
                        continue
                    if not english_name:
                        print "error english_name"
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        print "error team_find"
                        continue

                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace("  /", " / ")

                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                print "error member_list_page"
Example #10
def get_one_page_blog(account_id, page_count):
    # http://blog.nogizaka46.com/asuka.saito
    blog_url = "http://blog.nogizaka46.com/%s/?p=%s" % (account_id, page_count)
    blog_return_code, blog_page = tool.http_request(blog_url)[:2]
    if blog_return_code == 1:
        return tool.find_sub_string(blog_page, '<div class="paginate">', '<div class="paginate">', 1)
    return None
Example #11
def get_store_info(info):

    # Fetch basic item info & price
    store_api_url = STORE_URL % (
        info["itemid"],
        MY_AREA,
        info["venderId"],
        info["cat"],
        str(time.time()).replace(".", ""),
        JQNAME,
    )
    # ap(store_api_url)

    if not DEBUG:
        # response = requests.get(store_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": store_api_url,
            "method": "GET",
            "headers": {
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding":
                "gzip, deflate, br",
                "Accept-Language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                # "Referer": store_api_url,
                # "Pragma": "no-cache",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        # ap("response:", response)
        open("store_api.js",
             "w").write(tool.try_decode_html_content(response.body))
    store_api_content = open("store_api.js", "r").read()
    store_api_content_json = get_jsonp_json(store_api_content)
    # ap(store_api_content_json)

    # Extract the vendor name
    vender_string = (store_api_content_json["stock"].get("self_D")
                     or store_api_content_json["stock"].get("D")
                     or {}).get("vender") or "自营"

    # Use the PLUS price (usually lower) or fall back to the regular price
    if store_api_content_json["stock"].get("jdPrice"):
        price = store_api_content_json["stock"]["jdPrice"].get(
            "tpp") or store_api_content_json["stock"]["jdPrice"]["p"]
        if store_api_content_json["stock"]["jdPrice"].get("sfp"):
            price = min(store_api_content_json["stock"]["jdPrice"].get("sfp"),
                        price)
    else:
        price = "-1.00"

    # q.d()

    return {
        "price": float(price),
        "vender": vender_string,
        "stock": store_api_content_json["stock"]["StockStateName"],
    }
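# get_jsonp_json is referenced by several examples here; a hedged sketch of
# what it presumably does (an assumption, not the project's actual code):
import json
import re

def get_jsonp_json(jsonp_text):
    # strip a JSONP wrapper such as 'jQuery123({...});' and parse the JSON
    match = re.search(r"\(\s*(\{.*\})\s*\)", jsonp_text, re.S)
    return json.loads(match.group(1) if match else jsonp_text)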
Example #12
def get_one_page_follow_list(account_id, cursor=None):
    query_url = "https://www.instagram.com/query/"
    # fields supported by node: id,is_verified,followed_by_viewer,requested_by_viewer,full_name,profile_pic_url,username
    params = "nodes{username},page_info"
    if cursor is None:
        post_data = "q=ig_user(%s){follows.first(%s){%s}}" % (account_id, USER_COUNT_PER_PAGE, params)
    else:
        post_data = "q=ig_user(%s){follows.after(%s,%s){%s}}" % (account_id, cursor, USER_COUNT_PER_PAGE, params)
    # todo session id error
    # IGSCdaccb7f76627fa16a0d418f32a733030cb4cdeefaaddc5464a3da52eb8acfe06%3AID8fxYoOH96eMPpf4kEWwIhLA9ihMLuO%3A%7B%22_token_ver%22%3A2%2C%22_auth_user_id%22%3A3539660450%2C%22_token%22%3A%223539660450%3Amm50iieIxyG0NWWxuFifs0j23vhA5WpR%3Afd860ccd5c16e35eadf3e0946c00178b50fce7b45a9d09c62498dbbffdc8fa2b%22%2C%22asns%22%3A%7B%2247.89.39.193%22%3A45102%2C%22time%22%3A1480388199%7D%2C%22_auth_user_backend%22%3A%22accounts.backends.CaseInsensitiveModelBackend%22%2C%22last_refreshed%22%3A1480392303.831638%2C%22_platform%22%3A4%2C%22_auth_user_hash%22%3A%22%22%7D
    header_list = {
        "Referer": "https://www.instagram.com/",
        "X-CSRFToken": CSRF_TOKEN,
        "Cookie": "csrftoken=%s; sessionid=%s;" % (CSRF_TOKEN, SESSION_ID),
    }
    follow_list_return_code, follow_list_data = tool.http_request(query_url, post_data, header_list)[:2]
    if follow_list_return_code == 1:
        try:
            follow_list_data = json.loads(follow_list_data)
        except ValueError:
            pass
        else:
            if robot.check_sub_key(("follows",), follow_list_data):
                if robot.check_sub_key(("page_info", "nodes"), follow_list_data["follows"]):
                    if robot.check_sub_key(("end_cursor", "has_next_page"), follow_list_data["follows"]["page_info"]):
                        return follow_list_data["follows"]
    return None
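# robot.check_sub_key guards dictionary access throughout these examples; a
# minimal sketch of its presumable behavior (an assumption, not the project's
# actual code):
def check_sub_key(key_list, data):
    return isinstance(data, dict) and all(key in data for key in key_list)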
Example #13
def get_one_page_post(coser_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    post_url = "http://bcy.net/u/%s/post/cos?&p=%s" % (coser_id, page_count)
    post_page_return_code, post_page = tool.http_request(post_url)[:2]
    if post_page_return_code == 1:
        return post_page
    return None
Example #14
def get_image_url_list(cp_id, rp_id):
    # http://bcy.net/coser/detail/9299/36484
    rp_url = "http://bcy.net/coser/detail/%s/%s" % (cp_id, rp_id)
    rp_page_return_code, rp_page_response = tool.http_request(rp_url)[:2]
    if rp_page_return_code == 1:
        return re.findall("src='([^']*)'", rp_page_response)
    return None
Example #15
def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        page = tool.find_sub_string(page, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start: end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->',split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')

                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")
Example #16
def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->")
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        for team_name in split_list:
            team_page = tool.find_sub_string(page, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall('<dl>([\s|\S]*?)</dl>', team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall('<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', '</h3>')
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', '</li>')
                if len(japanese_name_find) != 1:
                    print "error japanese_name_find"
                    continue
                if not english_name:
                    print "error english_name"
                    continue

                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name

                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
Example #17
def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        team_find = re.findall('(<h3>[\s|\S]*?)<!-- / .contsbox --></div>', page)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                print "error team"
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                name_find = re.findall('''<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> ''', member)
                if len(name_find) != 1:
                    print "error name_find"
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)

                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")
Example #18
def get_one_page_audio_list(account_id, page_type, page_count):
    # http://changba.com/member/personcenter/loadmore.php?userid=4306405&pageNum=1
    audio_album_url = "http://5sing.kugou.com/%s/%s/%s.html" % (account_id, page_type, page_count)
    audio_album_return_code, audio_album_page = tool.http_request(audio_album_url)[:2]
    if audio_album_return_code == 1:
        return re.findall('<a href="http://5sing.kugou.com/' + page_type + '/([\d]*)\.html" [\s|\S]*? title="([^"]*)">', audio_album_page)
    return None
Example #19
def get_api_info(account_name):
    photo_index_url = "https://www.flickr.com/photos/%s" % account_name
    photo_index_return_code, photo_index_page = tool.http_request(photo_index_url)[:2]
    if photo_index_return_code == 1:
        user_id = tool.find_sub_string(photo_index_page, '"nsid":"', '"')
        site_key = tool.find_sub_string(photo_index_page, '"site_key":"', '"')
        return {"user_id": user_id, "site_key": site_key}
    return None
Example #20
def unfollow(account_id):
    unfollow_url = "http://bcy.net/weibo/Operate/follow?"
    unfollow_post_data = {"uid": account_id, "type": "unfollow"}
    unfollow_return_code, unfollow_return_data = tool.http_request(unfollow_url, unfollow_post_data)[:2]
    if unfollow_return_code == 1:
        if int(unfollow_return_data) == 1:
            return True
    return False
Example #21
def get_suid(account_id):
    index_page_url = "http://www.miaopai.com/u/paike_%s" % account_id
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        suid = tool.find_sub_string(index_page, '<button class="guanzhu gz" suid="', '" heade="1" token="">+关注</button>')
        if suid:
            return suid
    return None
Example #22
def get_user_id(account_id):
    index_url = "http://changba.com/u/%s" % account_id
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        user_id = tool.find_sub_string(index_page, "var userid = '", "'")
        if user_id:
            return user_id
    return None
Example #23
def get_account_id(account_name):
    account_index_url = "https://twitter.com/%s" % account_name
    account_index_return_code, account_index_page = tool.http_request(account_index_url)[:2]
    if account_index_return_code == 1:
        account_id = tool.find_sub_string(account_index_page, '<div class="ProfileNav" role="navigation" data-user-id="', '">')
        if account_id:
            return account_id
    return None
Example #24
def follow(account_id):
    follow_url = "http://bcy.net/weibo/Operate/follow?"
    follow_post_data = {"uid": account_id, "type": "dofollow"}
    follow_return_code, follow_return_data = tool.http_request(follow_url, follow_post_data)[:2]
    if follow_return_code == 1:
        # 0: not logged in, 11: follow succeeded, 12: already following
        if int(follow_return_data) == 12:
            return True
    return False
Example #25
def unfollow_account(auth_token, account_id):
    unfollow_url = "https://twitter.com/i/user/unfollow"
    unfollow_data = {"user_id": account_id}
    header_list = {"Cookie": "auth_token=%s;" % auth_token, "Referer": "https://twitter.com/"}
    unfollow_return_code, unfollow_data = tool.http_request(unfollow_url, unfollow_data, header_list)[:2]
    if unfollow_return_code == 1:
        if robot.check_sub_key(("new_state",), unfollow_data) and unfollow_data["new_state"] == "not-following":
            return True
    return False
Example #26
def get_thread_author_post(thread_url):
    thread_return_code, thread_page, thread_response = tool.http_request(thread_url)
    if thread_return_code == 1:
        content_type = tool.get_response_info(thread_response.info(), "Content-Type")
        charset = tool.find_sub_string(content_type, "charset=")
        post_message = tool.find_sub_string(thread_page, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        return post_message.decode(charset)
    return None
Example #27
def get_promote_info(info):

    # Fetch price plus promotions & coupons & gifts
    promote_api_url = PROMOTE_URL % (
        MY_AREA[0],
        MY_AREA[1],
        MY_AREA[2],
        info["itemid"],
        info["分类id"],
        int(time.time() * 1000),
    )

    # Fetch the page HTML content
    if not DEBUG:
        response = yield tool.http_request({
            "url": promote_api_url,
            "method": "GET",
            "headers": HEADERS
        })
        open("kaola.promopt_page.html",
             "w").write(tool.try_decode_html_content(response.body))

    item_content = open("kaola.promopt_page.html", "r").read()
    item_content = tool.json_load(item_content)

    # Aren't these two exactly the same?
    skuPrice = item_content["data"].get(
        "skuPrice") or item_content["data"]["skuDetailList"][0]["skuPrice"]
    min_price = min(skuPrice["currentPrice"], skuPrice["kaolaPrice"],
                    skuPrice["suggestPrice"], skuPrice["marketPrice"])
    presale = item_content["data"].get(
        "depositGoodsAdditionalInfo"
    ) or item_content["data"]["skuDetailList"][0]["depositSkuAdditionalInfo"]
    if presale:
        min_price = presale.get("handPrice") or min_price

    current_store = item_content["data"].get(
        "goodsCurrentStore"
    ) or item_content["data"]["skuDetailList"][0]["skuStore"]["currentStore"]

    promotion_info = item_content["data"].get("promotionList") or item_content[
        "data"]["skuDetailList"][0]["promotionList"] or []
    promote = [[x["promotionContent"], x["promotionUrl"], "0000 ~ 0000"]
               for x in promotion_info]

    quan = item_content["data"].get("goodsCouponList") or []

    # q.d()

    return {
        "min_price": min_price,
        "current_store": current_store,
        "promote": promote,
        "quan": quan,
        "presale": bool(presale),
    }
Example #28
def check_big_image(image_url, big_2_small_list):
    if image_url in big_2_small_list:
        big_image_display_page_return_code, big_image_display_page = tool.http_request(big_2_small_list[image_url])[:2]
        if big_image_display_page_return_code == 1:
            temp_image_url = tool.find_sub_string(big_image_display_page, '<img src="', '"')
            if temp_image_url != "/img/expired.gif":
                return temp_image_url, False
            else:
                return image_url, True  # once an expired image is found, all earlier images are expired too; no need to keep checking
    return image_url, False
Example #29
def get_one_page_diary_data(account_id, page_count):
    # http://www.keyakizaka46.com/mob/news/diarKiji.php?cd=member&ct=01&page=0&rw=20
    diary_page_url = "http://www.keyakizaka46.com/mob/news/diarKiji.php"
    diary_page_url += "?cd=member&ct=%02d&page=%s&rw=%s" % (int(account_id), page_count - 1, IMAGE_COUNT_PER_PAGE)
    diary_return_code, diary_page = tool.http_request(diary_page_url)[:2]
    if diary_return_code == 1:
        diary_page = tool.find_sub_string(diary_page, '<div class="box-main">', '<div class="box-sideMember">')
        if diary_page:
            return re.findall("<article>([\s|\S]*?)</article>", diary_page)
    return None
Example #30
def get_bbs_forum_url_list(index_url):
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        forum_find = re.findall('<a href="(forum-\w*-\d*\.\w*)"[^>]*>([\S]*)</a>', index_page)
        host = index_url[0: index_url.rfind("/") + 1]
        forum_url_list = {}
        for forum_path, forum_name in forum_find:
            forum_url_list[host + forum_path] = forum_name
        return forum_url_list
    return None
Example #31
def save_video(ts_file_list, file_path):
    file_handle = open(file_path, "wb")
    for ts_file_url in ts_file_list:
        ts_file_return_code, ts_file_data = tool.http_request(ts_file_url)[:2]
        if ts_file_return_code == 1:
            file_handle.write(ts_file_data)
        else:
            file_handle.close()
            return False
    file_handle.close()
    return True
Example #32
def get_ts_url_list(file_url, ts_file_list):
    file_return_code, file_data = tool.http_request(file_url)[:2]
    if file_return_code == 1:
        new_file_url_list = re.findall("(/ext_tw_video/[\S]*)", file_data)
        for new_file_url in new_file_url_list:
            new_file_url = "https://video.twimg.com%s" % new_file_url
            if new_file_url.split(".")[-1] == "m3u8":
                get_ts_url_list(new_file_url, ts_file_list)
            elif new_file_url.split(".")[-1] == "ts":
                ts_file_list.append(new_file_url)
Example #33
def get_ts_url_list(link_url):
    video_link_return_code, video_link_data = tool.http_request(link_url)[:2]
    if video_link_return_code == 1:
        ts_id_list = re.findall("([\S]*.ts)", video_link_data)
        prefix_url = link_url[:link_url.rfind("/") + 1]
        ts_file_list = []
        for ts_id in ts_id_list:
            ts_file_list.append(prefix_url + ts_id)
        return ts_file_list
    else:
        return None
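# A minimal usage sketch combining this with Example #31's save_video (the
# playlist URL and output file name are hypothetical):
ts_file_list = get_ts_url_list("http://example.com/video/playlist.m3u8")
if ts_file_list:
    save_video(ts_file_list, "video.ts")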
Example #34
def get_one_forum_page_thread_url_list(forum_url):
    forum_return_code, forum_page = tool.http_request(forum_url)[:2]
    if forum_return_code == 1:
        forum_page = tool.find_sub_string(forum_page, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1\.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None
Example #35
def get_video_url_list(tweet_id):
    video_page_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        m3u8_file_url = tool.find_sub_string(video_page, "&quot;video_url&quot;:&quot;", "&quot;")
        if m3u8_file_url:
            m3u8_file_url = m3u8_file_url.replace("\\/", "/")
            ts_url_list = []
            get_ts_url_list(m3u8_file_url, ts_url_list)
            return "ts", ts_url_list
        vmap_file_url = tool.find_sub_string(video_page, "&quot;vmap_url&quot;:&quot;", "&quot;")
        if vmap_file_url:
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_return_code, vmap_file = tool.http_request(vmap_file_url)[:2]
            if vmap_file_return_code == 1:
                media_file_url = tool.find_sub_string(vmap_file, "<![CDATA[", "]]>")
                if media_file_url:
                    file_type = media_file_url.split(".")[-1].split("?")[0]
                    return file_type, media_file_url
    return "", []
Example #36
def construct_http_get(proxy_host, proxy_port, timeout):
    return tool.http_request({
        "url": URL,
        "method": "GET",
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "body": None,
        "proxy_host": proxy_host,
        "proxy_port": proxy_port,
        "request_timeout": timeout,
    })
Example #37
def crawler_page_html(page_url, retry=True):

    # raise gen.Return( open("test.html", "rb").read() )   # DEBUG

    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding":
            "gzip, deflate",
            "accept-language":
            "zh-CN,zh;q=0.9",
            "cache-control":
            "max-age=0",
            "upgrade-insecure-requests":
            "1",
            "user-agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
            "Connection":
            "keep-alive",
            "Cookie":
            "ASPSESSIONIDASCTATQD=HLEJHLFCBJGLBDACDDJMMAHI; UM_distinctid=16426d109c7134-042982a838072c-5b183a13-1fa400-16426d109c832a; ASPSESSIONIDACAQTRDQ=KDGLGOOAPFCDAAPFELIODBBD; CNZZDATA1256284042=1049911583-1529656073-http%253A%252F%252Fwww.89ip.cn%252F%7C1530003389",
        },
        "proxy_host": None,
        "proxy_port": None,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)

    if response.code != 200:
        # raise Exception("http status code %s,%s" % (response.code, response.error))
        raise gen.Return("")

    # open("test.html", "wb").write(response.body)   # DEBUG
    raise gen.Return(response.body)
Example #38
def construct_http_post(proxy_host, proxy_port, timeout):
    return tool.http_request({
        "url": URL,
        "method": "POST",
        "headers": {
            "Content-Length": "17",
            "Content-Type": "application/x-www-form-urlencoded",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "body": "username=bogeming",
        "proxy_host": proxy_host,
        "proxy_port": proxy_port,
        "request_timeout": timeout,
    })
Example #39
def get_presale_info(info):

    # Fetch basic item info & price
    presale_api_url = PRESALE_URL % (
        info["itemid"],
        str(time.time()).replace(".", ""),
        JQNAME,
    )
    ap(presale_api_url)

    if not DEBUG:
        # response = requests.get(presale_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": presale_api_url,
            "method": "GET",
            "headers": {
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding":
                "gzip, deflate, br",
                "Accept-Language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                # "Referer": presale_api_url,
                # "Pragma": "no-cache",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        # ap("response:", response)
        open("presale_api.js",
             "w").write(tool.try_decode_html_content(response.body))
    presale_api_content = open("presale_api.js", "r").read()
    presale_api_content_json = get_jsonp_json(presale_api_content)
    # ap(presale_api_content_json)

    # q.d()

    if not presale_api_content_json.get("ret"):
        return None

    return {
        "currentPrice": presale_api_content_json["ret"]["currentPrice"],
        "presaleStartTime":
        presale_api_content_json["ret"]["presaleStartTime"],
        "presaleEndTime": presale_api_content_json["ret"]["presaleEndTime"],
        # "balanceBeginTime": presale_api_content_json["ret"]["balanceBeginTime"],
        # "balanceEndTime": presale_api_content_json["ret"]["balanceEndTime"],
    }
Example #40
def get_base_info(item):

    # Fetch the page HTML content
    if not DEBUG:
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": HEADERS
        })
        open("kaola.base_url_page.html",
             "w").write(tool.try_decode_html_content(response.body))
    item_content = open("kaola.base_url_page.html", "r").read()
    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "$addGoods" in x),
        -1)
    info_text = item_content_lines[icat + 1:icat + 12]

    for i, line in enumerate(info_text):
        if "," in info_text[i]:
            info_text[i] = info_text[i][:info_text[i].index(",")]
            info_text[i] = info_text[i].replace("'", "").strip()
        else:
            ap("[WARN]:", "Something unexpected happened.")
            info_text[i] = ""

    info = {
        "分类id": info_text[0],
        "品牌id": info_text[1],
        "商品名称": info_text[2],
        "itemid": info_text[3],
        "商品售价": info_text[4],
        # "商品图片": info_text[5],
        "分类名": info_text[6],
        "品牌名": info_text[7],
        "商品库存": info_text[8],
        "网络价": info_text[9],
        # "收藏人数": info_text[10],
    }

    return info
Example #41
def get_base_info(item):

    # Fetch the page HTML content
    if not DEBUG:
        # response = requests.get(item["url"], headers={}, timeout=16)
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": {
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding":
                "gzip, deflate, br",
                "Accept-Language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                "Referer":
                item["url"],
                "Pragma":
                "no-cache",
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        open("content_page.html",
             "w").write(tool.try_decode_html_content(response.body))
    item_content = open("content_page.html", "r").read()

    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "cat: [" in x), -1)
    info = get_item_neighbor(item_content_lines, icat)

    for line in item_content_lines[:20]:
        if "<title>" in line:
            info["name"] = re.sub(
                r"""([\W]*<title>|[【][^【]*[】][^】]*</title>[\W]*$)""", "", line)

    return info
Example #42
def get_base_info(item):

    # Fetch the page HTML content
    if not DEBUG:
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": HEADERS
        })
        open("yanxuan.base_url_page.html",
             "w").write(tool.try_decode_html_content(response.body))

    item_content = open("yanxuan.base_url_page.html", "r").read()
    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "\"item\":" in x),
        -1)
    info_text = item_content_lines[icat][7:-1]
    info_json = tool.json_load(info_text)
    # info_text = info_text.replace("\"item\":", "")
    # if info_text[-1] == ",":
    #     info_text = info_text[0:-1]

    if item.get("iid"):
        item_info = next(
            (x for x in info_json["skuList"] if x["id"] == item["iid"]), {})
    else:
        item_info = info_json["skuList"][item["index"]]

    if not item_info:
        return None

    promote_info = item_info.get("hdrkDetailVOList")
    if item_info.get("couponShortNameList"):
        quan_info = item_info.get("couponShortNameList")
    elif item_info.get("shortCouponList"):
        quan_info = [x["displayName"] for x in item_info["shortCouponList"]]
    else:
        quan_info = None

    price = min(item_info["retailPrice"], item_info["counterPrice"],
                item_info["calcPrice"], item_info["preSellPrice"])

    if item_info.get("spmcBanner"):
        spmc_price = float(item_info["spmcBanner"].get("spmcPrice") or 0)
        price = spmc_price > 0 and min(spmc_price, price) or price

    if item_info.get("detailPromBanner"):
        activity_price = float(
            item_info["detailPromBanner"].get("activityPrice") or 0)
        price = activity_price > 0 and min(activity_price, price) or price

    info = {
        "name": item_info["skuTitle"],
        "iid": item_info["id"],
        "promote":
        [[x["name"], x["huodongUrlPc"], "0 ~ 0"] for x in promote_info],
        "quan": quan_info,
        "price": price,
        "store": item_info["sellVolume"],
    }

    return info
Example #43
def get_promote_info(info):

    # Fetch price plus promotions & coupons & gifts
    promote_api_url = PROMOTE_URL % (
        info["itemid"],
        MY_AREA,
        info["shopId"],
        info["venderId"],
        info["cat"].replace(",", "%2C"),
        info["price"],
        str(time.time()).replace(".", "")[:-3],
        JQNAME,
    )

    # promote_api_url = """https://cd.jd.com/promotion/v2?callback=jQuery5415158&skuId=65610440044&area=19_1601_3635_0&shopId=10131385&venderId=10252350&cat=1672%%2C2599%%2C12078&isCanUseDQ=1&isCanUseJQ=1&platform=0&orgType=2&jdPrice=299.00&appid=1&_=%s""" % str(time.time()).replace(".", "")[:-3]
    # promote_api_url = """https://cd.jd.com/promotion/v2?callback=jQuery4255721&skuId=65610440044&area=19_1601_3635_0&shopId=10131385&venderId=10252350&cat=1672%%2C2599%%2C12078&isCanUseDQ=1&isCanUseJQ=1&platform=0&orgType=2&jdPrice=299.00&appid=1&_=%s""" % str(time.time()).replace(".", "")[:-3]
    # ap(promote_api_url)

    if not DEBUG:
        # response = requests.get(promote_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": promote_api_url,
            "method": "GET",
            "headers": {
                # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                # "Accept-Encoding": "gzip, deflate, br",
                # "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                "accept":
                "*/*",
                "accept-encoding":
                "gzip, deflate, br",
                "accept-language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6,ja;q=0.5",
                "Referer":
                "https://item.jd.com/65610440044.html",
                "Pragma":
                "no-cache",
                # Hmm, a cookie is required now
                "cookie":
                "__jdv=76161171|direct|-|none|-|1614594740019; __jdu=1614594740018869872184; areaId=19; ipLoc-djd=19-1601-3633-0; PCSYCityID=CN_440000_440100_440106; shshshfpa=00883b4f-d3c1-1602-7cd6-17731ed20a6e-1614594741; shshshfpb=m8UQnw74GyqJycpcp0lvCLg%3D%3D; __jda=122270672.1614594740018869872184.1614594740.1614594740.1614594740.1; __jdc=122270672; 3AB9D23F7A4B3C9B=RE4QF44JWCVUXEC7MQAZGA24NVF27LEI6CEQC4P7SABGXROC4ZDLKLWQBR6ULUZOEYHS5I7WMZBDNH5KDNWYC7VZFY; shshshfp=0263d234510f0c11eede903101b88cca; shshshsID=4122aa91f19c1d2e5fbd4fbec3deab0d_3_1614594756699; __jdb=122270672.4.1614594740018869872184|1.1614594740",
                "sec-fetch-dest":
                "script",
                "sec-fetch-mode":
                "no-cors",
                "sec-fetch-site":
                "same-site",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
            }
        })
        open("promote_api.js",
             "w").write(tool.try_decode_html_content(response.body))
    promote_api_content = open("promote_api.js", "r").read()

    promote_api_content_json = get_jsonp_json(promote_api_content)
    # ap(promote_api_content_json)

    # Inline ads
    ads_strings = [
        x["ad"].replace("<", "&lt;").replace(">", "&gt;")
        for x in promote_api_content_json.get("ads") or []
    ]

    # Promotion campaigns
    promote_strings = map(
        lambda x: [
            x["content"],
            GET_PROMOTE_URL % x["pid"].split("_")[0],
            "%s ~ %s" % (tool.get_datetime_from_stamp(x["st"]),
                         tool.get_datetime_from_stamp(x["d"])),
        ], promote_api_content_json["prom"]["pickOneTag"])
    promote_strings = list(promote_strings)
    # ap(promote_strings)

    # Freebies / gifts
    gift_strings = []
    for tag in promote_api_content_json["prom"]["tags"]:
        if "gifts" in tag:
            gift_string = map(lambda x: [x["nm"], CONTENT_URL % x["sid"]],
                              tag["gifts"])
            gift_string = list(gift_string)
            gift_strings.append([
                tag["name"],
                tool.get_datetime_from_stamp(tag["d"]), gift_string
            ])
        elif "name" in tag:
            gift_strings.append(
                [tag["name"],
                 tool.get_datetime_from_stamp(tag["d"])])

    # Rebate coupons / rebate points
    feedback_strings = ""
    if promote_api_content_json.get("quan"):
        feedback_url = promote_api_content_json["quan"]["actUrl"] or (
            MFDETAIL % promote_api_content_json["quan"]["activityId"])
        if feedback_url[:2] == "//":
            feedback_url = "https:%s" % feedback_url
        feedback_strings = [
            feedback_url, promote_api_content_json["quan"]["title"]
        ]

    # Coupons to claim / coupons to use
    quan_strings = []
    if promote_api_content_json.get("skuCoupon"):
        for item in promote_api_content_json["skuCoupon"]:
            quan_string = item.get("allDesc") or "满%s减%s" % (item["quota"],
                                                             item["discount"])
            quan_strings.append([
                quan_string,
                "%s ~ %s" %
                (item.get("beginTime") or "", item.get("endTime") or "")
            ])
            quan_strings[-1].append(item.get("key"))
            quan_strings[-1].append(item.get("url") or "")

    # q.d()

    return {
        "promote": promote_strings,
        "gift": gift_strings,
        "quan": quan_strings,
        "feedback": feedback_strings,
        "ads": ads_strings,
    }
Example #44
    def get_page(self, group, i):
        """
            []
        """
        page_url = PAGE_STRUCT % (group, i * 25)
        tool.aprint("doing:", page_url)

        res = yield tool.http_request({
            "url": page_url,
            "method": "GET",
            "headers": {
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding":
                "gzip, deflate, br",
                "Accept-Language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                "Referer":
                page_url,
                "Pragma":
                "no-cache",
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        open("www.douban.com.html", "wb").write(res.body)
        content = open("www.douban.com.html", "r").read()

        # news_list
        content_dom = pyquery.parse_content_to_dom(content)
        content_ele = content_dom.find("#content").find("table>tr:gt(0)")

        result_list = []
        for idx, row in pyquery.iter_eles(content_ele):
            title = row.find("td").eq(0).find("a").attr("title")
            href = row.find("td").eq(0).find("a").attr("href")

            author = row.find("td").eq(1).find("a").text()
            author_href = row.find("td").eq(1).find("a").attr("href")

            comment = row.find("td").eq(2).text()
            date = row.find("td").eq(3).text()

            result_list.append({
                "group": group,
                "group_name": self.name_list[group],
                "title": title,
                "href": href,
                "author": author,
                "comment": comment or "0",
            })
            # date: 2014-05-31 || 08-23 15:29
            if ":" in date:
                date = "%s-%s:00" % (tool.get_date_string()[:4], date)
            else:
                date = "%s 00:00:00" % (date)
            result_list[-1]["date"] = date
            result_list[-1]["id"] = int(href.split("/")[-2])
            result_list[-1]["author_id"] = author_href.split("/")[-2]
            result_list[-1]["price"] = self.get_price_from_item(
                result_list[-1])

        tool.aprint("result:", len(result_list))
        # return result_list
        raise tornado.gen.Return(result_list)