Beispiel #1
0
 def construct_headers(self):
     """Return request headers for linsen.fengj.com with a randomly chosen User-Agent."""
     return {
         "Referer": "http://linsen.fengj.com/",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": UAPool().get(),
     }
Beispiel #2
0
    def get_contact_info(self, url):
        """Fetch a company's contact page and extract its contact details.

        :param url: absolute URL of the company's contact page
        :return: tuple ``(company_info, contact_info_picture_url)`` —
            all text nodes under the ``contact`` element, and the ``src``
            attributes of the contact images (phone numbers are often
            rendered as images on these sites)
        """
        from urllib.parse import urlsplit

        session = requests.session()
        # Target sites frequently have broken certificates; skip verification.
        session.verify = False

        # Derive Host / Referer from the requested URL itself
        # (equivalent to the former manual url.split('/') handling).
        parts = urlsplit(url)
        host = parts.netloc
        refer = "{}://{}".format(parts.scheme, parts.netloc)

        session.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": host,
            "Referer": refer,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": UAPool().get()
        }

        # Replay previously saved login cookies from disk.
        jar = RequestsCookieJar()
        with open('cookies.txt', 'r') as file:
            for cookie in json.load(file):
                jar.set(cookie['name'], cookie['value'])

        # Route the request through a rotating HTTP proxy.
        proxies = {
            "http": "http://" + IPool().get_proxy()
        }

        response = session.get(url, cookies=jar, proxies=proxies, timeout=5)
        page = response.text
        print("|=========================================================================================================|")
        html = etree.HTML(page)
        company_info = html.xpath('//*[@class="contact"]//text()')
        contact_info_picture_url = html.xpath('//*[@class="contact"]/div/ul/li/img/@src|//*[@class="contact"]/div/p/img/@src')
        return company_info, contact_info_picture_url
Beispiel #3
0
 def construct_headers(self):
     """Return one header dict per search-result page (1..92), each with a fresh User-Agent."""
     referer_template = ("http://www.fengj.com/so/SearchInfo.aspx"
                         "?keyword=%b7%cf%d6%bd&info_type=sell&page={}")
     return [
         {
             "Referer": referer_template.format(page),
             "Upgrade-Insecure-Requests": "1",
             "User-Agent": UAPool().get(),
         }
         for page in range(1, 93)
     ]
Beispiel #4
0
 def __init__(self):
     """Connect to the local MongoDB, start a Chrome driver and prepare default headers."""
     self.host = '127.0.0.1'
     self.port = 27017
     self.conn = MongoClient(host=self.host, port=self.port)
     self.driver = webdriver.Chrome()
     # Listing pages are paginated via the {} placeholder.
     self.base_url = "https://www.b2b168.com/k-waimaofushi/l-{}.html"
     self.headers = {
         "Accept": ("text/html,application/xhtml+xml,application/xml"
                    ";q=0.9,image/webp,image/apng,*/*;q=0.8"),
         "Accept-Encoding": "gzip, deflate, br",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Connection": "keep-alive",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": UAPool().get(),
         "Host": "www.b2b168.com",
     }
Beispiel #5
0
 def get_company_url(self, url):
     """Collect the contact-page URLs of the companies listed on *url*.

     :param url: listing page to scrape
     :return: list of ``<company>/contactusNews.aspx`` URLs, or None if
         the request/parsing failed (the error is printed, not raised)
     """
     try:
         session = requests.session()
         session.verify = False
         session.headers = {
             "User-Agent": UAPool().get()
         }
         # Replay previously saved login cookies from disk.
         jar = RequestsCookieJar()
         with open('cookies.txt', 'r') as file:
             for cookie in json.load(file):
                 jar.set(cookie['name'], cookie['value'])
         response = session.get(url, cookies=jar)
         html = etree.HTML(response.text)
         hrefs = html.xpath('//*[@class="pro_lists"]/div/div/h2/a/@href')
         company_url_list = []
         # The original code unconditionally dropped the first href —
         # presumably a non-company entry; TODO confirm against the page.
         # Slicing (unlike ``del hrefs[0]``) also tolerates an empty result.
         for href in hrefs[1:]:
             # NOTE: the loop variable no longer shadows the ``url`` parameter.
             company_url = href + "/contactusNews.aspx"
             print(company_url)
             company_url_list.append(company_url)
         return company_url_list
     except Exception as e:
         # Best-effort scraping: log and return None so the caller can skip
         # this listing page instead of aborting the whole crawl.
         print(e)
         return None
Beispiel #6
0
    def construct(self):
        """Build the search-result URLs for pages 2..93 plus a matching header dict per URL."""
        template = ("http://www.fengj.com/so/SearchInfo.aspx"
                    "?keyword=%b7%cf%d6%bd&info_type=sell&page={}")
        url_list = []
        headers2 = []
        for page in range(2, 94):
            page_url = template.format(page)
            url_list.append(page_url)
            # Each page gets its own Referer and a freshly picked User-Agent.
            headers2.append({
                "Accept": ("text/html,application/xhtml+xml,application/xml"
                           ";q=0.9,image/webp,image/apng,*/*;q=0.8"),
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Cache-Control": "max-age=0",
                "Host": "www.fengj.com",
                "Proxy-Connection": "keep-alive",
                "Referer": page_url,
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": UAPool().get(),
            })

        return url_list, headers2