Exemple #1
0
def request_detail(keyword):
    """Search Baidu for *keyword*, fetch every result link and print the outcome.

    The search page is downloaded synchronously; each result link found in
    a ``c-container`` div is then scheduled as ``fetch(url, title, keyword,
    index_id)`` on the asyncio event loop (``fetch`` must therefore return
    an awaitable). Once all tasks have finished, their results are printed.

    Args:
        keyword: Search term appended verbatim to the Baidu query URL.

    Returns:
        None. Request errors on the search page are logged and swallowed.
    """
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("eeeee:{}".format(e))
        return

    zloop = asyncio.get_event_loop()
    tasks = []
    if r:
        print("请求关键字成功:{}".format(url))
        selector = getXpath(r.text)
        node_list = selector.xpath('''//div[contains(@class,"c-container")]''')

        for node in node_list:
            hrefs = node.xpath('''h3/a[1]/@href''')
            title = str(node.xpath('''string(h3/a[1])'''))
            if not hrefs:
                continue
            url_baidu_detail = hrefs[0]
            # Skip empty/too-short links and Baidu-internal "/sf/" links.
            if len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue
            # Fall back to 0 when the node has no id attribute; previously
            # the empty xpath result list itself was passed on as the index.
            raw_id = node.xpath("@id")
            index_id = int(raw_id[0]) if raw_id else 0
            # Bind the task to the loop that will actually run it.
            tasks.append(
                asyncio.ensure_future(
                    fetch(url_baidu_detail, title, keyword, index_id),
                    loop=zloop))

        # asyncio.wait() raises ValueError on an empty collection, which
        # happens whenever the page yields no usable result link.
        if tasks:
            zloop.run_until_complete(asyncio.wait(tasks))

    for val in tasks:
        values = val.result()
        print("xjl")

        print(values)
        print("xjl2")
Exemple #2
0
def requests_baidu_keyword(q):
    """Search Baidu for keyword *q* and fetch every result link in greenlets.

    The module-level dicts ``flag_jd``, ``flag_1688`` and ``flag_b2bbaidu``
    are reset, one ``fetch`` greenlet is spawned per result link, and after
    all greenlets have finished the three dicts are printed.
    NOTE(review): this presumably pairs with a ``fetch`` that records its
    matches into those dicts - confirm against the ``fetch`` in use.

    Args:
        q: Search term appended verbatim to the Baidu query URL.

    Returns:
        None.

    Raises:
        Whatever ``requests.get`` raises for the search request; it is not
        caught here.
    """
    global flag_jd, flag_1688, flag_b2bbaidu
    keyword = q
    # Reset the shared result maps before doing anything else so the prints
    # at the end never hit an undefined name (or stale data from an earlier
    # keyword) when the search response is not OK.
    flag_jd = {}
    flag_1688 = {}
    flag_b2bbaidu = {}

    tasks_list = []
    url = "http://www.baidu.com/s?wd=" + keyword
    # A timeout keeps a stalled connection from blocking forever; the value
    # matches the sibling implementations of this search.
    r = requests.get(url, headers=Header, timeout=6)
    if r:
        logging.info("请求关键字成功:{}".format(url))
        selector = getXpath(r.text)
        node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
        for node in node_list:
            hrefs = node.xpath('''h3/a[1]/@href''')
            title = str(node.xpath('''string(h3/a[1])'''))
            if not hrefs:
                continue
            url_baidu_detail = hrefs[0]
            # Skip empty/too-short links and Baidu-internal "/sf/" links.
            if len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue
            # Fall back to 0 when the node has no id attribute; previously
            # the empty xpath result list itself was passed on as the index.
            raw_id = node.xpath("@id")
            index_id = int(raw_id[0]) if raw_id else 0
            tasks_list.append(
                gevent.spawn(fetch, url_baidu_detail, title, keyword,
                             index_id))
        gevent.joinall(tasks_list)

    print(flag_jd)
    print(flag_1688)
    print(flag_b2bbaidu)
Exemple #3
0
def _apply_detected_encoding(response):
    """Set ``response.encoding`` from the charset detected in the body."""
    # apparent_encoding re-runs charset detection on every access and may be
    # None (e.g. for an empty body), so read it exactly once and guard it.
    detected = response.apparent_encoding
    if not detected:
        return
    if detected == "utf-8" or detected.startswith("UTF-8") or detected == "utf8":
        response.encoding = "utf-8"
    elif detected == "GB2312" or detected.startswith(("ISO-8859", "Windows")):
        response.encoding = "gbk"


def _full_title(response, title):
    """Return the page ``<title>`` when the search-result title is truncated."""
    if "..." not in title:
        return title
    found = getXpath(response.text).xpath('''//head/title/text()''')
    # Keep the truncated title if the page has no <title>; previously the
    # empty xpath result list itself ended up being stored as the title.
    return found[0] if found else title


def fetch(url, title, keyword, index_id):
    """Download *url* and classify the final URL against the watched sites.

    The final (post-redirect) URL is matched, in order, against
    ``supervisory[0]`` (tag ``"jd"``), ``supervisory[1]`` (tag ``"1688"``)
    and ``supervisory[2]`` (tag ``"b2b.baidu.com"``); the first match wins.

    Args:
        url: Link taken from the search result page.
        title: Title shown in the search result; if it contains ``"..."``
            it is replaced by the target page's ``<title>`` when available.
        keyword: Search keyword that produced this result.
        index_id: Position/id of the result on the search page.

    Returns:
        A dict with keys ``tag``, ``keyword``, ``title``, ``url`` and
        ``index`` for the first matching site, or ``None`` when the request
        fails, the response is not OK, or no watched site matches.
    """
    try:
        r2 = requests.get(url, headers=Header, timeout=5)
    except Exception as e:
        logging.info("eeeee2:{}".format(e))
        return None

    if not (r2 and r2.status_code < 400):
        return None

    print(r2.url)
    _apply_detected_encoding(r2)

    # (URL fragment to look for, label used in the log line, result tag)
    sites = (
        (supervisory[0], "jd", "jd"),
        (supervisory[1], "1688", "1688"),
        (supervisory[2], "b2bbaidu", "b2b.baidu.com"),
    )
    for fragment, label, tag in sites:
        if fragment not in r2.url:
            continue
        logging.info("关键字{}-- {} 详情页url:{}".format(keyword, label, r2.url))
        title = _full_title(r2, title)

        if tag == "b2b.baidu.com":
            # Legacy side channel: older callers read matches from the
            # module-level flag_b2bbaidu dict. Keep feeding it when it
            # exists, but do not lose the result when it does not.
            try:
                flag_b2bbaidu[str(index_id)] = {
                    "tag": "b2bbaidu",
                    "title": title,
                    "url": r2.url
                }
            except NameError:
                pass

        return {
            "tag": tag,
            "keyword": keyword,
            "title": title,
            "url": r2.url,
            "index": index_id,
        }
    return None
Exemple #4
0
def request_detail(keyword):
    """Search Baidu for *keyword* and return the best hit per watched site.

    Every result link on the search page is fetched concurrently through
    ``pool_task`` using ``fetch``. The non-empty results are grouped by
    their ``"tag"`` (``"jd"``, ``"1688"``, ``"b2b.baidu.com"``) and, for
    each tag, the entry with the smallest ``"index"`` is kept.

    Args:
        keyword: Search term appended verbatim to the Baidu query URL.

    Returns:
        A list with at most one result dict per tag, ordered jd, 1688,
        b2b.baidu.com. The list is empty when the search request fails,
        the response is not OK, or nothing matched.
    """
    tasks_list_result = []
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("eeeee:{}".format(e))
        # Always hand back a list so callers can iterate unconditionally
        # (this path used to fall through and return None).
        return tasks_list_result

    if not r:
        return tasks_list_result

    logging.info("请求关键字成功:{}".format(url))
    selector = getXpath(r.text)
    node_list = selector.xpath(
        '''//div[contains(@class,"c-container")]''')

    tasks_list = []
    for node in node_list:
        hrefs = node.xpath('''h3/a[1]/@href''')
        title = str(node.xpath('''string(h3/a[1])'''))
        if not hrefs:
            continue
        url_baidu_detail = hrefs[0]
        # Skip empty/too-short links and Baidu-internal "/sf/" links.
        if len(url_baidu_detail) < 3:
            continue
        if str(url_baidu_detail).startswith("/sf/"):
            continue
        # Fall back to 0 when the node has no id attribute; previously the
        # empty xpath result list itself was used as the index, which
        # breaks the ordering by "index" below.
        raw_id = node.xpath("@id")
        index_id = int(raw_id[0]) if raw_id else 0
        tasks_list.append(
            pool_task.spawn(fetch, url_baidu_detail, title,
                            keyword, index_id))

    # Join once, after everything is spawned. Joining inside the loop made
    # each fetch wait for the previous one and defeated the pool.
    gevent.joinall(tasks_list)

    by_tag = {"jd": [], "1688": [], "b2b.baidu.com": []}
    for task in tasks_list:
        result = task.value
        if result and result["tag"] in by_tag:
            by_tag[result["tag"]].append(result)

    for tag in ("jd", "1688", "b2b.baidu.com"):
        candidates = by_tag[tag]
        if candidates:
            # min() returns the first entry with the lowest index, the same
            # element a stable ascending sort would put in front.
            tasks_list_result.append(
                min(candidates, key=lambda x: x['index']))
    return tasks_list_result
Exemple #5
0
def _apply_detected_encoding(response):
    """Set ``response.encoding`` from the charset detected in the body."""
    # apparent_encoding re-runs charset detection on every access and may be
    # None (e.g. for an empty body), so read it exactly once and guard it.
    detected = response.apparent_encoding
    if not detected:
        return
    if detected == "utf-8" or detected.startswith("UTF-8") or detected == "utf8":
        response.encoding = "utf-8"
    elif detected == "GB2312" or detected.startswith(("ISO-8859", "Windows")):
        response.encoding = "gbk"


def _full_title(response, title):
    """Return the page ``<title>`` when the search-result title is truncated."""
    if "..." not in title:
        return title
    found = getXpath(response.text).xpath('''//head/title/text()''')
    # Keep the truncated title if the page has no <title>; previously the
    # empty xpath result list itself ended up being stored as the title.
    return found[0] if found else title


def requests_baidu_keyword(q):
    """Consume keywords from *q* forever and store the first hit per site.

    For each keyword the Baidu result page is downloaded and every result
    link is followed sequentially. The first link whose final URL contains
    ``supervisory[0]`` / ``supervisory[1]`` / ``supervisory[2]`` is passed
    to ``insert_db`` with tag ``"jd"`` / ``"1688"`` / ``"b2b.baidu.com"``;
    later links for an already-found tag are ignored.

    Args:
        q: Object with a blocking ``get()`` that yields keyword strings.

    Returns:
        Never returns; the loop runs until an uncaught exception escapes.
    """
    # (URL fragment to look for, label used in the log line, result tag)
    sites = (
        (supervisory[0], "jd", "jd"),
        (supervisory[1], "1688", "1688"),
        (supervisory[2], "b2bbaidu", "b2b.baidu.com"),
    )
    while True:
        keyword = q.get()
        found_tags = set()
        url = "http://www.baidu.com/s?wd=" + keyword
        # A failed or stalled search request must not kill the worker loop,
        # so bound it with a timeout and move on to the next keyword.
        try:
            r = requests.get(url, headers=Header, timeout=6)
        except Exception as e:
            logging.warning(e)
            continue
        if not r:
            continue

        logging.info("请求关键字成功:{}".format(url))
        selector = getXpath(r.text)
        node_list = selector.xpath('''//div[contains(@class,"c-container")]''')
        for node in node_list:
            hrefs = node.xpath('''h3/a[1]/@href''')
            title = str(node.xpath('''string(h3/a[1])'''))
            if not hrefs:
                continue
            url_baidu_detail = hrefs[0]
            # Skip empty/too-short links and Baidu-internal "/sf/" links.
            if len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue

            # NOTE(review): verify=False disables TLS certificate checks for
            # the target sites; kept as in the original, confirm it is
            # intentional.
            try:
                r2 = requests.get(url_baidu_detail, headers=Header, verify=False, timeout=8)
            except Exception as e:
                logging.warning(e)
                continue
            if not r2:
                continue

            _apply_detected_encoding(r2)
            print(r2.url)

            for fragment, label, tag in sites:
                if tag in found_tags or fragment not in r2.url:
                    continue
                found_tags.add(tag)
                # Fall back to 0 when the node has no id attribute;
                # previously the empty xpath result list was stored.
                raw_id = node.xpath("@id")
                index_id = int(raw_id[0]) if raw_id else 0
                logging.info("关键字{}-- {} 详情页url:{}".format(keyword, label, r2.url))
                title = _full_title(r2, title)
                insert_db({
                    "tag": tag,
                    "keyword": keyword,
                    "title": title,
                    "url": r2.url,
                    "index": index_id,
                })
Exemple #6
0
# -*-coding: utf-8-*-

# **************************file desc*****************************
__author__ = 'yushanshan'
# createTime : 2019/7/18 19:54
# desc : fetch the aizhan.com lookup page for xinnet.com and parse it with getXpath
# ****************************************************************
from requests_url import getXpath
import requests

# Browser-like User-Agent sent with the request below.
Header = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
}
# NOTE(review): verify=False turns off TLS certificate verification for this
# HTTPS request; kept as in the original, confirm it is intentional.
# The timeout keeps an unresponsive server from hanging the script forever.
r = requests.get("https://www.aizhan.com/cha/xinnet.com/",
                 headers=Header,
                 verify=False,
                 timeout=10)

# Parse the downloaded page with the project's getXpath helper.
html = getXpath(r.text)
Exemple #7
0
def _apply_detected_encoding(response):
    """Set ``response.encoding`` from the charset detected in the body."""
    # apparent_encoding re-runs charset detection on every access and may be
    # None (e.g. for an empty body), so read it exactly once and guard it.
    detected = response.apparent_encoding
    if not detected:
        return
    if detected == "utf-8" or detected.startswith("UTF-8") or detected == "utf8":
        response.encoding = "utf-8"
    elif detected == "GB2312" or detected.startswith(("ISO-8859", "Windows")):
        response.encoding = "gbk"


def _full_title(response, title):
    """Return the page ``<title>`` when the search-result title is truncated."""
    if "..." not in title:
        return title
    found = getXpath(response.text).xpath('''//head/title/text()''')
    # Keep the truncated title if the page has no <title>; previously the
    # empty xpath result list itself ended up being stored as the title.
    return found[0] if found else title


def fetch(url, title, keyword, index_id):
    """Download *url* and record a hit in the module-level result dicts.

    The final (post-redirect) URL is checked against ``supervisory[0]``,
    ``supervisory[1]`` and ``supervisory[2]``. Every match stores
    ``{"tag", "title", "url"}`` under ``str(index_id)`` in ``flag_jd``,
    ``flag_1688`` or ``flag_b2bbaidu`` respectively; the checks are
    independent, so one URL may be recorded in more than one dict.

    Args:
        url: Link taken from the search result page.
        title: Title shown in the search result; if it contains ``"..."``
            it is replaced by the target page's ``<title>`` when available.
        keyword: Search keyword that produced this result (used in logs).
        index_id: Position/id of the result on the search page.

    Returns:
        None. Request errors are logged and swallowed, matching the
        dict-returning ``fetch`` variant of this scraper.
    """
    # Bound the request and survive network errors; without this a single
    # stalled or failing link would hang or kill the worker running it.
    try:
        r2 = requests.get(url, headers=Header, timeout=5)
    except Exception as e:
        logging.info("eeeee2:{}".format(e))
        return

    if not (r2 and r2.status_code < 400):
        return

    print(r2.url)
    _apply_detected_encoding(r2)

    # The dicts are only mutated, never rebound, so no ``global`` statement
    # is required to reach the module-level objects.
    if supervisory[0] in r2.url:
        logging.info("关键字{}-- jd 详情页url:{}".format(keyword, r2.url))
        title = _full_title(r2, title)
        flag_jd[str(index_id)] = {
            "tag": "jd",
            "title": title,
            "url": r2.url
        }

    if supervisory[1] in r2.url:
        logging.info("关键字{}-- 1688 详情页url:{}".format(keyword, r2.url))
        title = _full_title(r2, title)
        flag_1688[str(index_id)] = {
            "tag": "1688",
            "title": title,
            "url": r2.url
        }

    if supervisory[2] in r2.url:
        logging.info("关键字{}-- b2bbaidu 详情页url:{}".format(keyword, r2.url))
        title = _full_title(r2, title)
        flag_b2bbaidu[str(index_id)] = {
            "tag": "b2bbaidu",
            "title": title,
            "url": r2.url
        }
Exemple #8
0
def request_detail(keyword):
    """Search Baidu for *keyword* and print the best hit per watched site.

    Each result link is scheduled as ``fetch(url, title, keyword,
    index_id)`` on the asyncio event loop (``fetch`` must return an
    awaitable). Every task result is read as a 6-item sequence
    ``(html, url, title, keyword, index, tag)`` where ``html == "1"`` marks
    a failed fetch. Successful results are grouped by tag (``"jd"``,
    ``"1688"``, ``"b2b.baidu.com"``), the entry with the smallest index per
    tag is kept, and the resulting list is printed.

    Args:
        keyword: Search term appended verbatim to the Baidu query URL.

    Returns:
        None. Request errors on the search page are logged and swallowed.
    """
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("eeeee:{}".format(e))
        return

    zloop = asyncio.get_event_loop()
    tasks = []
    if r:
        print("请求关键字成功:{}".format(url))
        selector = getXpath(r.text)
        node_list = selector.xpath(
            '''//div[contains(@class,"c-container")]''')

        for node in node_list:
            hrefs = node.xpath('''h3/a[1]/@href''')
            title = str(node.xpath('''string(h3/a[1])'''))
            if not hrefs:
                continue
            url_baidu_detail = hrefs[0]
            # Skip empty/too-short links and Baidu-internal "/sf/" links.
            if len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue
            # Fall back to 0 when the node has no id attribute; previously
            # the empty xpath result list itself was passed on as the index.
            raw_id = node.xpath("@id")
            index_id = int(raw_id[0]) if raw_id else 0
            # Bind the task to the loop that will actually run it.
            tasks.append(
                asyncio.ensure_future(
                    fetch(url_baidu_detail, title, keyword, index_id),
                    loop=zloop))

        # asyncio.wait() raises ValueError on an empty collection, which
        # happens whenever the page yields no usable result link.
        if tasks:
            zloop.run_until_complete(asyncio.wait(tasks))

    by_tag = {"jd": [], "1688": [], "b2b.baidu.com": []}
    for val in tasks:
        values = val.result()
        if values[0] == "1":
            # "1" in the html slot is the failure sentinel.
            print("error2 stop " + str(values[1]))
            continue

        title = values[2]
        if "..." in title:
            # Truncated search title: prefer the page's own <title>.
            page_titles = getXpath(values[0]).xpath('''//head/title/text()''')
            if len(page_titles) > 0:
                title = page_titles[0]

        dicts = {
            "tag": values[5],
            "keyword": values[3],
            "title": title,
            "url": values[1],
            "index": values[4],
        }
        if dicts["tag"] in by_tag:
            by_tag[dicts["tag"]].append(dicts)

    tasks_list_result = []
    for tag in ("jd", "1688", "b2b.baidu.com"):
        candidates = by_tag[tag]
        if candidates:
            # min() returns the first entry with the lowest index, the same
            # element a stable ascending sort would put in front.
            tasks_list_result.append(
                min(candidates, key=lambda x: x['index']))
    print(tasks_list_result)
Exemple #9
0
def requests_baidu_keyword(keyword):
    """Search Baidu for *keyword* and store the best hit per watched site.

    Every result link on the search page is handed to ``fetch`` on a
    thread pool. The non-empty results are grouped by their ``"tag"``
    (``"jd"``, ``"1688"``, ``"b2b.baidu.com"``); for each tag the entry with
    the smallest ``"index"`` is submitted to ``insert_db`` on the same pool.

    The pool is shut down before returning, so all ``insert_db`` calls have
    finished by then. As before, exceptions raised inside ``insert_db`` are
    not surfaced, because the insert futures are never inspected.

    Args:
        keyword: Search term appended verbatim to the Baidu query URL.

    Returns:
        None. Request errors on the search page are logged and swallowed.
    """
    url = "http://www.baidu.com/s?wd=" + keyword
    try:
        r = requests.get(url, headers=Header, timeout=6)
    except Exception as e:
        logging.info("eeeee:{}".format(e))
        return
    if not r:
        return

    logging.info("请求关键字成功:{}".format(url))
    selector = getXpath(r.text)
    node_list = selector.xpath(
        '''//div[contains(@class,"c-container")]''')

    by_tag = {"jd": [], "1688": [], "b2b.baidu.com": []}
    # The context manager shuts the pool down on exit; the executor used to
    # be created per call and never released, leaking worker threads.
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = []
        for node in node_list:
            hrefs = node.xpath('''h3/a[1]/@href''')
            title = str(node.xpath('''string(h3/a[1])'''))
            if not hrefs:
                continue
            url_baidu_detail = hrefs[0]
            # Skip empty/too-short links and Baidu-internal "/sf/" links.
            if len(url_baidu_detail) < 3:
                continue
            if str(url_baidu_detail).startswith("/sf/"):
                continue
            # Fall back to 0 when the node has no id attribute; previously
            # the empty xpath result list itself was used as the index.
            raw_id = node.xpath("@id")
            index_id = int(raw_id[0]) if raw_id else 0
            futures.append(
                executor.submit(fetch, url_baidu_detail, title,
                                keyword, index_id))

        # Collect only after everything has been submitted. Calling
        # result() right after each submit() blocked on every fetch in
        # turn, so the pool never ran more than one request at a time.
        for future in futures:
            lingshi = future.result()
            if lingshi and lingshi["tag"] in by_tag:
                by_tag[lingshi["tag"]].append(lingshi)

        for tag in ("jd", "1688", "b2b.baidu.com"):
            candidates = by_tag[tag]
            if candidates:
                # min() returns the first entry with the lowest index, the
                # same element a stable ascending sort would put in front.
                best = min(candidates, key=lambda x: x['index'])
                executor.submit(insert_db, best)