Esempio n. 1
0
def parse(url, c, ts):
    """Parse a video page, decode the obfuscated player script to get the
    video src, and queue the src in redis when the clip is long enough.

    Args:
        url: video page URL to parse.
        c: redis connection, used to remove *url* from the pending list.
        ts: minimum duration threshold in minutes (str or int).
    """
    # Fetch the page once and reuse it below (it was previously fetched a
    # second time just to feed BeautifulSoup).
    html = common.visit(url)
    d = pq(html)
    # Old page layout exposed the src directly:
    #src = d("video").find("source").attr("src")
    # New layout hides the src inside an obfuscated (sojson.v5) inline
    # script; strip its wrapper and evaluate it with the decoder below.
    src = d("#player_one script").text()
    src = src[20:-8]
    context = js2py.EvalJs()
    js_code = ''';var encode_version = 'sojson.v5', lbbpm = '__0x33ad7',  __0x33ad7=['QMOTw6XDtVE=','w5XDgsORw5LCuQ==','wojDrWTChFU=','dkdJACw=','w6zDpXDDvsKVwqA=','ZifCsh85fsKaXsOOWg==','RcOvw47DghzDuA==','w7siYTLCnw=='];(function(_0x94dee0,_0x4a3b74){var _0x588ae7=function(_0x32b32e){while(--_0x32b32e){_0x94dee0['push'](_0x94dee0['shift']());}};_0x588ae7(++_0x4a3b74);}(__0x33ad7,0x8f));var _0x5b60=function(_0x4d4456,_0x5a24e3){_0x4d4456=_0x4d4456-0x0;var _0xa82079=__0x33ad7[_0x4d4456];if(_0x5b60['initialized']===undefined){(function(){var _0xef6e0=typeof window!=='undefined'?window:typeof process==='object'&&typeof require==='function'&&typeof global==='object'?global:this;var _0x221728='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=';_0xef6e0['atob']||(_0xef6e0['atob']=function(_0x4bb81e){var _0x1c1b59=String(_0x4bb81e)['replace'](/=+$/,'');for(var _0x5e3437=0x0,_0x2da204,_0x1f23f4,_0x3f19c1=0x0,_0x3fb8a7='';_0x1f23f4=_0x1c1b59['charAt'](_0x3f19c1++);~_0x1f23f4&&(_0x2da204=_0x5e3437%0x4?_0x2da204*0x40+_0x1f23f4:_0x1f23f4,_0x5e3437++%0x4)?_0x3fb8a7+=String['fromCharCode'](0xff&_0x2da204>>(-0x2*_0x5e3437&0x6)):0x0){_0x1f23f4=_0x221728['indexOf'](_0x1f23f4);}return _0x3fb8a7;});}());var _0x43712e=function(_0x2e9442,_0x305a3a){var _0x3702d8=[],_0x234ad1=0x0,_0xd45a92,_0x5a1bee='',_0x4a894e='';_0x2e9442=atob(_0x2e9442);for(var _0x67ab0e=0x0,_0x1753b1=_0x2e9442['length'];_0x67ab0e<_0x1753b1;_0x67ab0e++){_0x4a894e+='%'+('00'+_0x2e9442['charCodeAt'](_0x67ab0e)['toString'](0x10))['slice'](-0x2);}_0x2e9442=decodeURIComponent(_0x4a894e);for(var _0x246dd5=0x0;_0x246dd5<0x100;_0x246dd5++){_0x3702d8[_0x246dd5]=_0x246dd5;}for(_0x246dd5=0x0;_0x246dd5<0x100;_0x246dd5++){_0x234ad1=(_0x234ad1+_0x3702d8[_0x246dd5]+_0x305a3a['charCodeAt'](_0x246dd5%_0x305a3a['length']))%0x100;_0xd45a92=_0x3702d8[_0x246dd5];_0x3702d8[_0x246dd5]=_0x3702d8[_0x234ad1];_0x3702d8[_0x234ad1]=_0xd45a92;}_0x246dd5=0x0;_0x234ad1=0x0;for(var 
_0x39e824=0x0;_0x39e824<_0x2e9442['length'];_0x39e824++){_0x246dd5=(_0x246dd5+0x1)%0x100;_0x234ad1=(_0x234ad1+_0x3702d8[_0x246dd5])%0x100;_0xd45a92=_0x3702d8[_0x246dd5];_0x3702d8[_0x246dd5]=_0x3702d8[_0x234ad1];_0x3702d8[_0x234ad1]=_0xd45a92;_0x5a1bee+=String['fromCharCode'](_0x2e9442['charCodeAt'](_0x39e824)^_0x3702d8[(_0x3702d8[_0x246dd5]+_0x3702d8[_0x234ad1])%0x100]);}return _0x5a1bee;};_0x5b60['rc4']=_0x43712e;_0x5b60['data']={};_0x5b60['initialized']=!![];}var _0x4be5de=_0x5b60['data'][_0x4d4456];if(_0x4be5de===undefined){if(_0x5b60['once']===undefined){_0x5b60['once']=!![];}_0xa82079=_0x5b60['rc4'](_0xa82079,_0x5a24e3);_0x5b60['data'][_0x4d4456]=_0xa82079;}else{_0xa82079=_0x4be5de;}return _0xa82079;};if(typeof encode_version!=='undefined'&&encode_version==='sojson.v5'){function strencode(_0x50cb35,_0x1e821d){var _0x59f053={'MDWYS':'0|4|1|3|2','uyGXL':function _0x3726b1(_0x2b01e8,_0x53b357){return _0x2b01e8(_0x53b357);},'otDTt':function _0x4f6396(_0x33a2eb,_0x5aa7c9){return _0x33a2eb<_0x5aa7c9;},'tPPtN':function _0x3a63ea(_0x1546a9,_0x3fa992){return _0x1546a9%_0x3fa992;}};var _0xd6483c=_0x59f053[_0x5b60('0x0','cEiQ')][_0x5b60('0x1','&]Gi')]('|'),_0x1a3127=0x0;while(!![]){switch(_0xd6483c[_0x1a3127++]){case'0':_0x50cb35=_0x59f053[_0x5b60('0x2','ofbL')](atob,_0x50cb35);continue;case'1':code='';continue;case'2':return _0x59f053[_0x5b60('0x3','mLzQ')](atob,code);case'3':for(i=0x0;_0x59f053[_0x5b60('0x4','J2rX')](i,_0x50cb35[_0x5b60('0x5','Z(CX')]);i++){k=_0x59f053['tPPtN'](i,len);code+=String['fromCharCode'](_0x50cb35[_0x5b60('0x6','s4(u')](i)^_0x1e821d['charCodeAt'](k));}continue;case'4':len=_0x1e821d[_0x5b60('0x7','!Mys')];continue;}break;}}}else{alert('');};'''
    context.execute(js_code)
    src = context.eval(src)

    if src is not None:  # was `src != None`
        # The decoded value is an HTML fragment containing a <source> tag.
        src = pq(src)
        src = src("source").attr("src")
        # Reuse the page fetched above instead of visiting the url again.
        soup = BeautifulSoup(html, "lxml")
        con = soup.find(name="div", attrs={"class": "boxPart"}).text
        con = "".join(con.split())
        t = con.split(":")
        times = int(t[1])  # duration in minutes — assumes "label:MM:SS" shape; TODO confirm
        ts = int(ts)
        if times >= ts:
            print(threading.current_thread().name, " 满足条件插入redis: ", src)
            redisutil.add(src, common.KEY_SRC)
            c.lrem(common.KEY, 1, url)
        else:
            print(threading.current_thread().name, src, "时长不够,时长:", times,
                  "分钟")
    else:
        print(threading.current_thread().name, url,
              " url的src解析为None, 插入 redis_error")
        redisutil.add(url, common.KEY_NONE)
Esempio n. 2
0
def parseList(url, page):
    """Collect 'category=mr' links from the listing page *url* and queue
    unseen ones in redis.

    Args:
        url: listing page URL.
        page: page number, persisted to a per-thread log file so a crawl
            killed mid-run can resume from this page instead of page 1.
    """
    # Record the page this thread is currently on (overwritten each page).
    with open("f:/" + threading.current_thread().name + ".log", "w") as f:
        f.write(str(page))

    html = common.visit(url)
    if html == "error":
        redisutil.add(url, "error")
        print("出现错误了, ", url)
        return
    # Keep only hrefs pointing at the 'mr' category.
    links = re.compile(r'href="(.+?)"').findall(html)
    result = [a for a in links if "category=mr" in a]
    # `set()` de-duplicates; iterating an empty set is a no-op, so the old
    # `result is not None and len(result) > 0` guard was redundant.
    for a in set(result):
        if not redisutil.exists(a, common.KEY) and not redisutil.exists(
                a, common.VISITED):
            redisutil.add(a, common.KEY)

            print(threading.current_thread().name, " insert into redis ",
                  a)
        else:
            print(threading.current_thread().name, " redis 已经存在,不再访问 ", a)
Esempio n. 3
0
def parseList(url):
    """Extract view_video links from the listing page *url* and queue
    unseen ones in redis."""
    #lst = re.compile(r'http:\/\/91\.91p17\.space\/view_video\.php\?viewkey\=\w+').findall(common.visit(url))
    html = common.visit(url)
    soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all(
        name='a', attrs={"href": re.compile(r'^http(.*)view_video(.*)')})
    # The original loop variable shadowed the `url` parameter; renamed.
    for anchor in anchors:
        href = anchor.get('href')
        if not redisutil.exists(href, common.KEY):
            redisutil.add(href, common.KEY)
            print(threading.current_thread().name, " insert into redis ", href)
        else:
            print(threading.current_thread().name, " redis 已经存在,不再访问 ", href)
Esempio n. 4
0
def enter(**kwargs):
    """Drain the [start, end] slice of the pending-url list.

    Expects keyword arguments ``start``, ``end`` (list indices) and
    ``ts`` (duration threshold forwarded to ``parse``). Each url is
    parsed, removed from the pending list and remembered as already
    processed; completion is appended to the parse log.
    """
    start, end, ts = kwargs["start"], kwargs["end"], kwargs["ts"]
    conn = redisutil.connect()
    pending = conn.lrange(common.KEY, int(start), int(end))

    for item in pending:
        print(threading.current_thread().name, " parsing url ", item)
        parse(item, conn, ts)
        conn.lrem(common.KEY, 0, item)
        redisutil.add(item, common.KEY_ALREADY_PAGE)
        time.sleep(0.1)  # be gentle between requests

    with open(common.PARSE_LOG, "a") as f:
        f.write(threading.current_thread().name + " 已经解析完毕.\n")
Esempio n. 5
0
File: 91.py Progetto: zouyaowu/91
def enter(**kwargs):
    """Crawl watch-feed listing pages in range(start, end).

    Expects keyword arguments ``start`` and ``end``. Pages that raise a
    RuntimeError are recorded under the '91_error' redis key and skipped;
    thread completion is appended to a log file.
    """
    start = kwargs["start"]
    end = kwargs["end"]
    for page in range(start, end):
        url = "http://91.91p17.space/v.php?next=watch&page=" + str(page)
        try:
            print(threading.current_thread().name, " 解析 ", page, " 页 ", url)
            parseList(url)
            time.sleep(random.randint(1, 3))  # randomized crawl delay
        except RuntimeError as e:
            # Bug fix: the original printed RuntimeError.__with_traceback__
            # (an unbound method of the class), not the raised exception.
            print(threading.current_thread().name, " visiting page ", page,
                  " occurs some errors ", e)
            redisutil.add(url, "91_error")
            continue
    with open("e:/test.log", "a") as f:
        f.write(threading.current_thread().name + " over \n")
Esempio n. 6
0
def enter(**kwargs):
    """Crawl category-'mf' listing pages in range(start, end).

    Expects keyword arguments ``start`` and ``end``. A page that raises a
    RuntimeError is recorded under '91_error' and skipped. Completion is
    appended to the common log so finished threads are easy to spot.
    """
    start, end = kwargs["start"], kwargs["end"]
    for page_no in range(start, end):
        # url = common.URL + "/video.php?category=rf&page=" + str(page)
        page_url = common.URL + "v.php?category=mf&page=" + str(page_no)
        try:
            print(threading.current_thread().name, " 解析 ", page_no, " 页 ",
                  page_url)
            parseList(page_url)
            time.sleep(random.randint(1, 3))  # randomized crawl delay
        except RuntimeError:
            redisutil.add(page_url, "91_error")
            continue
    # Log that this thread finished its whole range.
    with open(common.LOG, "a") as f:
        f.write("线程" + threading.current_thread().name + " 已经完成抓取 \n")
Esempio n. 7
0
def enter(**kwargs):
    """Crawl watch-feed listing pages in range(start, end).

    Expects keyword arguments ``start`` and ``end``. The current page
    number is forwarded to ``parseList`` so it can be checkpointed.
    Pages that raise a RuntimeError are recorded under '91_error' and
    skipped; completion is appended to the common log.
    """
    start = kwargs["start"]
    end = kwargs["end"]
    for page in range(start, end):
        url = common.URL + "/v.php?next=watch&page=" + str(page)
        try:
            print(threading.current_thread().name, " 解析 ", page, " 页 ", url)
            parseList(url, page)
            time.sleep(random.randint(1, 3))  # randomized crawl delay
        except RuntimeError as e:
            # Bug fix: the original printed RuntimeError.__with_traceback__
            # (an unbound method of the class), not the raised exception.
            print(threading.current_thread().name, " visiting page ", page,
                  " occurs some errors ", e)
            redisutil.add(url, "91_error")
            continue
    # current thread has finished, log it and we can easily know it
    with open(common.LOG, "a") as f:
        f.write("线程" + threading.current_thread().name + " 已经完成抓取 \n")
Esempio n. 8
0
def parse(url, c, ts):
    """Parse a video page and queue its <source> src in redis when the
    clip is at least *ts* minutes long.

    Args:
        url: video page URL.
        c: redis connection, used to remove *url* from the pending list.
        ts: minimum duration threshold in minutes (str or int).
    """
    # Fetch once and reuse — the original visited the same url twice.
    html = common.visit(url)
    d = pq(html)
    src = d("video").find("source").attr("src")
    if src is not None:  # was `src != None`
        soup = BeautifulSoup(html, "lxml")
        con = soup.find(name="div", attrs={"class": "boxPart"}).text
        con = "".join(con.split())
        t = con.split(":")
        times = int(t[1])  # duration in minutes — assumes "label:MM:SS" shape; TODO confirm
        ts = int(ts)
        if times >= ts:
            print(threading.current_thread().name, " insert into redis ", src)
            redisutil.add(src, common.KEY_SRC)
            c.lrem(common.KEY, 1, url)
        else:
            print(threading.current_thread().name, src, "Not enough time")
    else:
        print(threading.current_thread().name, src, "解析为None, 插入 redis_error")
        # Bug fix: src is None here, so storing it was useless; store the
        # failing url (as the sibling parse() does) so it can be retried.
        redisutil.add(url, common.KEY_NONE)
Esempio n. 9
0
def parse(url, c):
    """Fetch *url* with retries and a spoofed X-Forwarded-For header,
    extract the video <source> src and queue it in redis if unseen.

    Args:
        url: video page URL.
        c: redis connection, used to remove *url* from the "91" list.
    """
    # Spoofed client IP: four random octets joined with dots
    # (replaces the hand-rolled str() + "." concatenation chain).
    randomIP = ".".join(str(random.randint(0, 255)) for _ in range(4))
    retries = Retry(total=5,
                    backoff_factor=10,
                    status_forcelist=[500, 502, 503, 504])
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'X-Forwarded-For': randomIP
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=retries))
    d = pq(s.get(url, headers=headers, cookies=cookies).text)
    src = d("video").find("source").attr("src")
    if src is not None and not redisutil.exists(src, KEY):  # was `!= None`
        print(threading.current_thread().name, " insert into redis ", src)
        redisutil.add(src, KEY)
        c.lrem("91", 1, url)
    else:
        print(threading.current_thread().name, src, "解析为None 或者 已经存在,不再访问")
Esempio n. 10
0
def parse(url, c):
    """Parse a video page: move the url from the pending list to the
    visited list, queue the video src, and map file name -> title.

    Args:
        url: video page URL.
        c: redis connection for the list/hash bookkeeping.
    """
    html = common.visit1(url)
    if html == "error":
        redisutil.add(url, "parse_error")
        print("parsing url ", url, " has some errors")
        return
    d = pq(html)
    src = d("video").find("source").attr("src")
    title = d("head").find("title").html()
    # Parse the remaining metadata (duration, name, ...).
    if src is not None:  # was `src != None`
        print(threading.current_thread().name, " insert into redis ", src)
        c.lrem(common.KEY, 1,
               url)  # KEY holds urls still to parse; VISITED holds parsed urls
        c.rpush(common.VISITED, url)
        c.rpush(common.SRC, src)
        print(threading.current_thread().name, " 解析了 ", title)
        # (A redundant nested `if src is not None` — always true on this
        # branch — was removed.)
        name = src.split("?")[0].split("/")[-1]
        c.hset("91_detail", name, title)  # file name -> Chinese title mapping
    else:
        print(threading.current_thread().name, src, "解析为None, 插入 redis_error")
        # Bug fix: src is None here, so storing it was useless; store the
        # failing url so it can be inspected or retried later.
        redisutil.add(url, common.KEY_NONE)
Esempio n. 11
0
File: 91.py Progetto: zouyaowu/91
def parseList(url):
    """Fetch the listing page *url* with retries and a spoofed
    X-Forwarded-For header, then queue every unseen view_video link."""
    # Spoofed client IP: four random octets joined with dots
    # (replaces the hand-rolled str() + "." concatenation chain).
    randomIP = ".".join(str(random.randint(0, 255)) for _ in range(4))
    retries = Retry(total=5,
                    backoff_factor=10,
                    status_forcelist=[500, 502, 503, 504])
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'X-Forwarded-For': randomIP
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=retries))
    r = s.get(url, headers=headers, cookies=cookies)
    lst = re.compile(
        r'http:\/\/91\.91p17\.space\/view_video\.php\?viewkey\=\w+').findall(
            r.text)
    for a in set(lst):  # de-duplicate before touching redis
        if not redisutil.exists(a, KEY):
            redisutil.add(a, KEY)
        else:
            print(threading.current_thread().name, " redis 已经存在,不再访问 ", a)
Esempio n. 12
0
def parse(url, c, ts):
    """Parse a video page, read the clip duration from the page, and
    queue the <source> src in redis when it is at least *ts* minutes.

    Args:
        url: video page URL.
        c: redis connection, used to remove *url* from the pending list.
        ts: minimum duration threshold in minutes (str or int).
    """
    d = pq(common.visit(url))
    src = d("video").find("source").attr("src")

    m = d("#useraction .boxPart").html()
    # Text between the duration label span and the next span, whitespace
    # stripped, e.g. "12:34" or "1:12:34".
    cn = re.search(u'时长:</span>(.*?)<span', m, re.S).group(1)
    tc = "".join(cn.split())
    t = tc.split(":")
    if len(t) == 3:
        # H:MM:SS — generalized: the original `int(t[1]) + 60` silently
        # assumed exactly one hour; compute hours * 60 + minutes instead
        # (identical result for H == 1, correct for longer clips).
        times = int(t[0]) * 60 + int(t[1])
    else:
        # MM:SS — minutes only.
        times = int(t[0])
    ts = int(ts)
    if times < ts:
        pass  # too short: deliberately ignored
        #print( "时长不够不予处理")
    elif src is not None:  # was `src != None`
        print(threading.current_thread().name, " insert into redis ", src)
        redisutil.add(src, common.KEY_SRC)
        c.lrem(common.KEY, 1, url)
    else:
        print(threading.current_thread().name, src, "解析为None, 插入 redis_error")
        redisutil.add(src, common.KEY_NONE)
def enter(**kwargs):
    """Download the [start, end] slice of the "91_src" redis list.

    Each list entry is either a bare src string or a JSON-like map with
    'src' and 'title' keys. Entries already downloaded, or pointing at
    the undownloadable v2 host, are simply removed from the list and
    marked done.

    Expects keyword arguments ``start`` and ``end`` (list indices).
    """
    start = kwargs["start"]
    end = kwargs["end"]
    global file_name
    global dir_path
    lst = client.lrange("91_src", start, end)

    def _mark_done(entry):
        # Remove the entry from the pending list and remember it as done
        # (this pair was previously duplicated in all three branches).
        client.lrem("91_src", 0, entry)
        redisutil.add(entry, common.KEY_ALREADY_DOWNLOAD)

    for a in lst:
        src_map_str = a.decode("utf-8")
        title = ""
        if not r'title' in src_map_str and not r'src' in src_map_str:
            # Plain src string, no metadata wrapper.
            src = src_map_str
        else:
            src_map = demjson.decode(src_map_str)
            src = src_map['src']
            title = src_map['title']
        if not title:  # `len(title) == 0` was redundant with the falsiness check
            # No usable title: fall back to a random numeric file name.
            title = str(random.randint(11111111111, 99999999999999999))
        # Strip control characters that would break the file name.
        title = title.replace("\t", "").replace("\n", "").replace("\r", "")
        file_name = title
        # file_path = dir_path + file_name + ".mp4"
        date_path = dir_path + common.getCurrentDate()
        file_path = date_path + '/' + file_name + ".mp4"
        if os.path.exists(file_path) or redisutil.exists(
                a, common.KEY_ALREADY_DOWNLOAD):
            print('已经下载的url,删除')
            _mark_done(a)
            continue
        if not r'http://v2' in src:
            download(src)
            print(threading.current_thread().name, " 下载 ", src,
                  " 完成, 从redis 删除")
            _mark_done(a)
        else:
            print('无法下载的url-v2')
            _mark_done(a)