Example #1
def rt_parser(items):
    pids = get_pids(items)
    if not pids:
        log_with_time("got nothing: %s" % items)
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
            random.randint(1000000, 10000000), int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
            random.randint(1000000, 10000000), int(time.time() * 1000))

    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        log_with_time("not 200: %s" % price_res["text"])
        return
    try:
        price_json = jsonp_json(price_res["text"])
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:
        traceback.print_exc()
        return
    prices = {}
    for i in price_json:
        prices[i["id"].split("_")[1]] = i["p"]
    stocks = {}
    for k, v in stock_json.items():
        s = v["StockStateName"]
        # treat "有货" (in stock) and "现货" (in store) as available
        if u"有货" in s or u"现货" in s:
            stocks[k] = 1
        else:
            stocks[k] = 0
    ret = []
    for pid in prices:
        ret.append((str(pid), str(prices[pid]), stocks[pid]))
    return format_price(ret)
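
Example #1 relies on a jsonp_json helper that is not shown on this page; it presumably strips the JSONP callback wrapper before parsing. A minimal sketch of such a helper, assuming the endpoints wrap a JSON object or array in a callback(...) envelope (the regex and body are illustrative, not the project's actual code):

import json
import re

# Hypothetical helper: strip a "callback({...});" JSONP envelope and parse the body.
def jsonp_json(text):
    match = re.search(r"\(\s*(\{.*\}|\[.*\])\s*\)\s*;?\s*$", text, re.S)
    if not match:
        raise ValueError("no JSONP payload found")
    return json.loads(match.group(1))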
Example #2
def get_work_list(author_url):
    if not author_url:
        pdb.set_trace()
    h, c = simple_http.get(author_url, proxy=proxy)
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    urls = t.xpath(work_last)
    total = 0
    ret = []
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    ret.extend(
        zip([x.attrib["src"] for x in t.xpath(work_title2)],
            [base + x.attrib["href"] for x in t.xpath(work_url)],
            [x.text for x in t.xpath(author_name)]))
    for i in range(2, total + 1):
        h, c = simple_http.get(author_url + "page=%d/" % i, proxy=proxy)
        t = etree.HTML(c)
        ret.extend(
            zip([x.attrib["src"] for x in t.xpath(work_title2)],
                [base + x.attrib["href"] for x in t.xpath(work_url)],
                [x.text for x in t.xpath(author_name)]))
    return ret
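
page_to_id, used above to read the total page count out of the pager's last link, is not defined in these examples. A plausible minimal version, assuming hrefs carry a page=N segment as the request URLs above suggest (purely illustrative):

import re

# Hypothetical: extract the page number from an href such as ".../page=12/".
def page_to_id(href):
    m = re.search(r"page=(\d+)", href)
    return int(m.group(1)) if m else 0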
Example #5
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        pdb.set_trace()
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [x for x in t.xpath(torrent_xpath) if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        if h["status"] == 302:
            h, c = simple_http.get(h["Location"], proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        # jpg payloads are binary; write in binary mode
        f = open(name, "wb+")
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
Example #6
def get_work_list_digital(author_url):
    while True:
        h, c = simple_http.get(author_url, proxy=proxy)
        if h["status"] == 200:
            break
    t = etree.HTML(c)
    urls = t.xpath(p_work_last)
    total = 0
    ret = []
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    ret.extend(
        zip([x.attrib["alt"] for x in t.xpath(p_work_title)],
            [x.attrib["href"] for x in t.xpath(p_work_url)]))
    for i in range(2, total + 1):
        while True:
            h, c = simple_http.get(author_url + "page=%d/" % i, proxy=proxy)
            if h["status"] == 200:
                break
        t = etree.HTML(c)
        ret.extend(
            zip([x.attrib["alt"] for x in t.xpath(p_work_title)],
                [x.attrib["href"] for x in t.xpath(p_work_url)]))
    return ret
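
The while True loops above retry forever on any non-200 status, which hangs on a permanently broken page. A bounded retry with a short pause is one safer alternative; this is a sketch under that assumption, not the project's code:

import time
import simple_http  # the project's HTTP helper used throughout these examples

# Hypothetical bounded retry around simple_http.get; gives up after `tries` attempts.
def get_with_retry(url, tries=5, delay=1.0, **kwargs):
    for _ in range(tries):
        h, c = simple_http.get(url, **kwargs)
        if h["status"] == 200:
            return h, c
        time.sleep(delay)
    raise IOError("giving up on %s after %d tries" % (url, tries))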
Example #7
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        return
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [
        x for x in t.xpath(torrent_xpath)
        if x.attrib["href"].startswith("attach")
    ]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if not "jpg" in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        except socket.error:
            continue
        if h["status"] != 200:
            if h["status"] == 302:
                location = h["Location"]
                if not location.startswith("http"):
                    url = {}
                    url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                    url["path"] = location
                    location = simple_http.generate_url(url)
                h, c = simple_http.get(location, proxy=proxy)
                if h["status"] != 200:
                    continue
        if len(c) < 10240:
            continue
        try:
            f = open(name.decode("utf-8"), "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            continue
        try:
            f = open(name.decode("utf-8"), "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
Example #8
def task_get_http():
    run_http("get_http")
    try:
        os.wait()
    except KeyboardInterrupt:
        pass
    payload = {"t1": "v1", "t2": "v2"}
    simple_http.get("127.0.0.1:6711", query=payload)
    pdb.set_trace()
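
The query keyword above presumably URL-encodes the payload into the request's query string, much as urllib does. For reference, the call is roughly equivalent to fetching this URL (standard-library sketch; the equivalence is an assumption):

import urllib

payload = {"t1": "v1", "t2": "v2"}
# Roughly what simple_http.get("127.0.0.1:6711", query=payload) requests.
print "http://127.0.0.1:6711/?" + urllib.urlencode(payload)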
Example #10
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy, timeout=20, cookie=cookie)
    if not h:
        return
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    pics = t.xpath(img_xpath) + t.xpath(img2_xpath)
    torrent = [x for x in t.xpath(torrent_xpath) if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        except socket.error:
            continue
        if h["status"] == 302:
            location = h["Location"]
            if not location.startswith("http"):
                # relative redirect: rebuild an absolute URL on the image host
                url = {}
                url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                url["path"] = location
                location = simple_http.generate_url(url)
            h, c = simple_http.get(location, proxy=proxy)
        if h["status"] != 200:
            continue
        if len(c) < 10240:
            continue
        try:
            f = open(name, "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy, cookie=cookie, timeout=20)
        if h["status"] != 200:
            continue
        try:
            f = open(name, "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
Example #11
def download_link(name, link):
    h, c = simple_http.get(link, header=simple_http.download_header)
    if h["status"] == 302:
        h, c = simple_http.get(h["Location"],
                               header=simple_http.download_header)
    if h["status"] != 200:
        pdb.set_trace()
    f = open(name, "wb+")
    f.write(c)
    f.close()
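
download_link follows at most one 302 hop by hand. If deeper redirect chains show up, a small loop generalizes the pattern; a sketch, assuming simple_http keeps returning a "Location" header on each 302 (this helper is not part of the original code):

import simple_http

# Hypothetical: follow up to max_hops 302 responses before giving up.
def get_following_redirects(url, max_hops=3, **kwargs):
    h, c = simple_http.get(url, **kwargs)
    hops = 0
    while h["status"] == 302 and hops < max_hops:
        h, c = simple_http.get(h["Location"], **kwargs)
        hops += 1
    return h, c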
Example #15
def sis_login(user, password):
    h, c = simple_http.get(base + "logging.php?action=login",
                           proxy=proxy,
                           timeout=10)
    t = etree.HTML(c.decode("gbk"))
    formhash = t.xpath("/html/body/div[4]/div[1]/div[4]/div[1]/form/input[1]"
                       )[0].attrib["value"].encode("utf-8")
    referer = t.xpath("/html/body/div[4]/div[1]/div[4]/div[1]/form/input[2]"
                      )[0].attrib["value"].encode("utf-8")
    payload = {
        "formhash": formhash,
        "referer": base + referer,
        "cookietime": "2592000",
        "loginfield": "username",
        "username": user,
        "password": password,
        "questionid": "",
        "answer": "",
        "loginsubmit": "true"
    }
    h, c = simple_http.post(base + "logging.php?action=login",
                            proxy=proxy,
                            payload=payload,
                            timeout=10)
    return simple_http.client_cookie(h["Set-Cookie"])
Example #16
def get_category_dvd():
    h, c = simple_http.get(dvd, proxy=proxy)
    t = etree.HTML(c)
    urls = t.xpath(proun)
    for i in urls:
        print utf8(i.text)
        get_author_list(base + i.attrib["href"])
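
utf8, called by this and several other listing helpers, is not defined on this page. Under Python 2 it is presumably a small encoder along these lines (an illustrative guess):

# Hypothetical: encode a possibly-unicode lxml text node so print can emit it.
def utf8(text):
    if isinstance(text, unicode):
        return text.encode("utf-8")
    return text or ""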
Example #17
def get_page(t, i):
    url = "%sforum-%s-%d.html" % (base, t, i)
    h, c = simple_http.get(url, cookie=cookie, proxy=proxy, timeout=10)
    t = etree.HTML(c.decode("gbk"))
    al = t.xpath(p_xpath) + t.xpath(p2_xpath)
    for i, v in enumerate(al):
        print "[%d/%d] %s" % (i + 1, len(al), v.text)
        get_content(v.text, v.attrib["href"])
Example #18
def do(url, n):
    h, content = simple_http.get(url % n)
    if h["status"] != 200:
        pdb.set_trace()
    s = etree.HTML(content)
    videos = s.xpath(xpath)
    for i in videos:
        print "%s\n%s" % (i.attrib["title"], i.attrib["href"])
Example #19
def down(url, output, proxy=""):
    res = simple_http.get(url, proxy=proxy, header=simple_http.download_header)
    if res["status"] != 200:
        pprint.pprint(res)
        exit(1)
    f = open(output, "wb+")
    f.write(res["text"])
    f.close()
Example #23
def down(url, output, proxy=""):
    h, c = simple_http.get(url,
                           proxy=proxy,
                           header=simple_http.download_header)
    if h["status"] != 200:
        pprint.pprint(h)
        exit(1)
    f = open(output, "wb+")
    f.write(c)
    f.close()
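
Examples #19 and #23 show the two simple_http.get return conventions used across this page: a single response dict (res["status"], res["text"]) and a (header, body) tuple; both write the body in binary mode. A usage sketch for the tuple form (URL and filename are placeholders):

if __name__ == "__main__":
    # fetch through an optional proxy and save the raw body to disk
    down("http://example.com/file.bin", "file.bin", proxy="")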
Example #24
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        pdb.set_trace()
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [x for x in t.xpath(torrent_xpath) if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        if h["status"] == 302:
            location = h["Location"]
            if not location.startswith("http"):
                # relative redirect: rebuild an absolute URL on the image host
                url = {}
                url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                url["path"] = location
                location = simple_http.generate_url(url)
            h, c = simple_http.get(location, proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
Example #25
def get_author_list(category):
    h, c = simple_http.get(category, proxy=proxy)
    t = etree.HTML(c)
    urls = t.xpath(last)
    total = 0
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    for i in t.xpath(p):
        print utf8(i.xpath("string()"))
        print utf8(base + i.attrib["href"])
    for i in range(2, total + 1):
        h, c = simple_http.get(category + "/page=%d" % i, proxy=proxy)
        t = etree.HTML(c)
        for i in t.xpath(p):
            print utf8(i.xpath("string()"))
            print utf8(base + i.attrib["href"])
Example #27
def redis_proxy_get():
    import simple_http
    redis_proxy_connect()
    res = simple_http.get("http://127.0.0.1:8866/redis_proxy",
                          query={
                              "node": "test",
                              "type": "list",
                              "key": "jd_page",
                              "batch": "10"
                          })
    pdb.set_trace()
    assert res["status"] == 200
Example #28
def redis_proxy_connect():
    import simple_http
    import json
    res = simple_http.get("http://127.0.0.1:8866/connect",
                          query={
                              "node": "test",
                              "db": json.dumps({
                                  "host": "127.0.0.1",
                                  "port": 6379
                              })
                          })
    assert res["status"] == 200
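
Read together, Examples #27 and #28 suggest the intended workflow for the redis proxy: register a named connection first, then pull batches from a list key through /redis_proxy. A combined usage sketch, assuming the endpoints behave as the two tests above imply:

import json
import simple_http

# 1. bind the name "test" to a local redis instance
res = simple_http.get("http://127.0.0.1:8866/connect",
                      query={"node": "test",
                             "db": json.dumps({"host": "127.0.0.1", "port": 6379})})
assert res["status"] == 200
# 2. pop up to 10 items from the list key "jd_page" via the named connection
res = simple_http.get("http://127.0.0.1:8866/redis_proxy",
                      query={"node": "test", "type": "list",
                             "key": "jd_page", "batch": "10"})
assert res["status"] == 200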
Example #29
def get_categroy_digital():
    p1 = ["", "k", "s", "t", "n", "h", "m", "y", "r", "w"]
    p2 = ["a", "i", "u", "e", "o"]
    pages = []
    for i in p1:
        pages.extend(["%s=/keyword=%s/" % (digital_base, i + x) for x in p2])
    authors = []
    for i in pages:
        h, c = simple_http.get(i, proxy=proxy)
        t = etree.HTML(c)
        for i in t.xpath(p_digital) + t.xpath(ps_digital):
            print utf8(i.xpath("string()"))
            print utf8(i.attrib["href"])
Example #32
def down_pic(name, pic_url):
    print name, pic_url
    if os.path.exists(name):
        return
    h, c = simple_http.get(pic_url, proxy=proxy)
    if h["status"] != 200:
        print "skip", pic_url
        return
    try:
        f = open(name, "wb+")
    except IOError:
        f = open(name.split("-")[-1], "wb+")
    f.write(c)
    f.close()
Example #35
def down_page(tid, pid):
    h, c = simple_http.get(thread_base % (tid, pid), proxy=proxy)
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    url = []
    if pid == 1:
        a = t.xpath(p1_xpath)
        b = t.xpath(p1_special)
        url.extend(a)
        url.extend(b)
    else:
        url.extend(t.xpath(pn_xpath))
    for i, v in enumerate(url):
        print "[p%d-%d]: %s" % (pid, i + 1, v.text)
        down_one(v.text.encode("utf-8"), base + v.attrib["href"])
Example #36
def download_album(aid):
    header, content = simple_http.get("http://music.baidu.com/album/%s" % aid)
    if header["status"] != 200:
        print "failed"
        print header
        exit(1)
    t = etree.HTML(content)
    songs = []
    for i in t.xpath(ALBUM_XPATH):
        songs.append((i.attrib["title"], i.attrib["href"]))
    for i, v in enumerate(songs):
        link = download_by_id(v[1].strip("/song/"))
        if not link:
            continue
        print "===================\n[%d/%d]: %s\n%s" % (i + 1, len(songs),
                                                        link[0], link[1])
Example #37
def get_songids(uid):
    ret = []
    for i in range(0xffff):
        query = {"start": str(i * 25), "ting_uid": uid, "order": "hot"}
        h, c = simple_http.get("http://music.baidu.com/data/user/getsongs",
                               query=query)
        if h["status"] != 200:
            break
        t = json.loads(c)["data"]["html"]
        tree = etree.HTML(t)
        result = [x for x in tree.xpath(SONGID_XPATH)]
        if not result:
            break
        for i in result:
            if not "class" in i.attrib:
                ret.append((i.attrib["title"],
                            ID_PATTERN.match(i.attrib["href"]).groups()[0]))
    return ret
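
ID_PATTERN is not shown on this page; judging from the .groups()[0] call, it captures one group from the song href, presumably the numeric id. An illustrative guess, assuming hrefs of the form "/song/123":

import re

# Hypothetical: capture the song id out of an href such as "/song/123".
ID_PATTERN = re.compile(r"/song/(\d+)")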
Example #39
    def post(self):
        m = self.get_arguments("sfile")
        t = self.get_arguments("stype")
        e = self.get_arguments("sexpr")
        if not m or not t or not e:
            self.write("arguments not enough")
            return
        m = m[0]
        t = t[0]
        e = e[0]

        fn = None
        if not m.startswith("file://"):
            fn = "list.html"
            f = open("list.html", "w")
            res = simple_http.get(m)
            if res["status"] != 200:
                self.write("err, bad response. %r" % h["status"])
                return
            f.write(res["text"])
            f.close()
        else:
            fn = m[7:]
        e = e.encode("utf-8")
        p = ""
        if t == "xpath":
            p = ","
        elif t == "text":
            p = "-"
        elif t == "line":
            p = ">"
        elif t == "attr":
            p = "."
        try:
            rt = subprocess.check_output(["./qxt", fn, p + e],
                                         stderr=sys.stderr)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.write("call failed.")
            return
        self.write(rt)
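
The handler above maps each stype value to a one-character selector prefix ("," xpath, "-" text, ">" line, "." attr) before shelling out to the local ./qxt binary. A request sketch against it, assuming the handler is mounted at /search on port 8888 (both the path and port are guesses, not from the source):

import simple_http

payload = {"sfile": "http://example.com/list.html",
           "stype": "xpath",
           "sexpr": "//a/@href"}
# Hypothetical endpoint; adjust path and port to wherever the handler is mounted.
h, c = simple_http.post("http://127.0.0.1:8888/search", payload=payload)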
Example #40
def get_songids(uid):
    ret = []
    for i in range(0xffff):
        query = {
                "start": str(i * 25),
                "ting_uid": uid,
                "order": "hot"
                }
        h, c = simple_http.get("http://music.baidu.com/data/user/getsongs", query=query)
        if h["status"] != 200:
            break
        t = json.loads(c)["data"]["html"]
        tree = etree.HTML(t)
        result = [x for x in tree.xpath(SONGID_XPATH)]
        if not result:
            break
        for i in result:
            if "class" not in i.attrib:
                ret.append((i.attrib["title"], i.attrib["href"].split("#")[0]))
    return ret
Example #42
def down_page(tid, pid):
    h, c = simple_http.get(thread_base % (tid, pid), proxy=proxy)
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    url = []
    if pid == 1:
        a = t.xpath(p1_xpath)
        b = t.xpath(p1_special)
        url.extend(a)
        url.extend(b)
    else:
        url.extend(t.xpath(pn_xpath))
    for i, v in enumerate(url):
        try:
            down_one(v.text.encode("utf-8"), base + v.attrib["href"])
        except OSError:
            print "skip, [p%d-%d]: %s" % (pid, i + 1, v.text)
            continue
        print "[p%d-%d]: %s" % (pid, i + 1, v.text)
Example #44
def get_info(work_url):
    h, c = simple_http.get(work_url, proxy=proxy)
    t = etree.HTML(c)
    tags = t.xpath(skip_xpath)
    for i in tags:
        if i.text == skip:
            return {}
    try:
        cover_img = t.xpath(cover)[0].attrib["href"]
    except Exception:
        cover_img = ""
    pics = [replace(bigpic, x.attrib["src"], "jp") for x in t.xpath(photo)]
    try:
        name = t.xpath(title)[0].text
    except Exception:
        name = ""
    try:
        id = t.xpath(aid)[-2].text
    except Exception:
        id = ""
    return {"cover": cover_img, "pics": pics, "name": name, "aid": id}
Example #46
def get_content(title, url):
    h, c = simple_http.get(base + url, cookie=cookie, proxy=proxy, timeout=10)
    t = etree.HTML(c)
    down_one(title, base + url)
Example #47
def getpage(url, nsecs=5):
    try:
        a = time.time()
        h, content = simple_http.get(url)
    except Exception as e:
        raise Exception("request failed: %s error %s" % (url, e))
    print "=========\npage done in %fs: %s\ntimeout: %ds\n=========" % (time.time() - a, url, nsecs)
    try:
        t = etree.HTML(content)
    except Exception:
        print "fetch failed: %s" % url
        pprint.pprint(h)
        exit(1)
    urls = []
    host = simple_http.urlparse(url)["host"]
    # collect every script, img and link reference, made absolute against the page host
    for i in etree_util.query_element(t, "[script,img,link]"):
        attrib = i.attrib
        if "href" in attrib:
            url_dict = simple_http.urlparse(attrib["href"])
            if not url_dict["host"]:
                url_dict["host"] = host
            urls.append(simple_http.generate_url(url_dict))
        if "src" in attrib:
            url_dict = simple_http.urlparse(attrib["src"])
            if not url_dict["host"]:
                url_dict["host"] = host
            urls.append(simple_http.generate_url(url_dict))
    # fetch each asset in its own child process
    pids = []
    for i in urls:
        pid = os.fork()
        if not pid:
            try:
                a = time.time()
                simple_http.get(i)
                print "url done in %fs %s" % (time.time() - a, i)
            except Exception as e:
                print "url failed: %s" % i
                print "error %s" % e
                traceback.print_tb(sys.exc_info()[2])
            exit(0)
        else:
            pids.append(pid)
    # reap finished children, then kill whatever is still in progress
    def clean_children(signum, frame):
        for i in urls:
            pid, _, _ = os.wait3(os.WNOHANG)
            if pid:
                pids.remove(pid)
        for i in pids:
            os.kill(i, signal.SIGKILL)
        for i in pids:
            os.wait()
        print "request done, killed %d children" % len(pids)
    signal.setitimer(signal.ITIMER_REAL, nsecs)
    signal.signal(signal.SIGINT, clean_children)
    signal.signal(signal.SIGALRM, clean_children)
    # block until the alarm fires
    time.sleep(0xffff)
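
getpage forks one child per discovered asset and relies on SIGALRM to reap them, so it should be run from a plain script rather than an interactive session. A minimal usage sketch (the URL is a placeholder):

if __name__ == "__main__":
    # fetch the page plus its scripts, images and stylesheets, waiting nsecs at most
    getpage("http://example.com/", nsecs=5)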
Example #50
# -*- coding: utf-8 -*-
import datetime
import PyRSS2Gen
import simple_http
import json

if __name__ == '__main__':
    header, content = simple_http.get('http://api.acfun.tv/apiserver/content/channel?orderBy=1&channelId=110&pageSize=20&pageNo=1')
    x_content = json.loads(content)
    items = []
    # avoid shadowing the builtin list
    entries = x_content['data']['page']['list']
    for item_json in entries:
        items.append(PyRSS2Gen.RSSItem(
                title=item_json['title'],
                link='http://www.acfun.tv/a/ac' + str(item_json['contentId']),
                description=item_json['description'],
                guid=PyRSS2Gen.Guid('http://www.acfun.tv/a/ac' + str(item_json['contentId'])),
                pubDate=datetime.datetime.now()))

    rss = PyRSS2Gen.RSS2(
        title='Acfun 文章区',
        link='http://www.acfun.tv/v/list110/index.htm',
        description='Acfun 文章区',
        lastBuildDate=datetime.datetime.now(),
        items=items)
    rss.write_xml(open("D:\\feed.xml", "w"), encoding="utf-8")