def construct_headers(self):
    """Build the request headers for linsen.fengj.com with a rotating User-Agent."""
    return {
        "Referer": "http://linsen.fengj.com/",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": UAPool().get(),
    }
def get_contact_info(self, url):
    """Fetch a company detail page and extract its contact information.

    :param url: company page URL (e.g. "http://xxx.fengj.com/...")
    :return: tuple ``(company_info, contact_info_picture_url)`` — the text
             nodes under the page's "contact" element and the ``src`` URLs
             of the contact images found there.
    """
    from urllib.parse import urlsplit

    session = requests.session()
    # NOTE(review): TLS verification disabled session-wide; the target is
    # plain HTTP so this is presumably harmless — confirm before reuse.
    session.verify = False

    # Derive Host/Referer from the target URL with urlsplit instead of the
    # old manual split/del surgery, which raised IndexError for URLs that
    # had no path segment.
    parts = urlsplit(url)
    host = parts.netloc
    refer = "{}://{}".format(parts.scheme, parts.netloc)
    session.headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": host,
        "Referer": refer,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": UAPool().get()
    }

    # Replay previously saved login cookies.
    jar = RequestsCookieJar()
    with open('cookies.txt', 'r', encoding='utf-8') as file:
        for cookie in json.load(file):
            jar.set(cookie['name'], cookie['value'])

    proxies = {
        "http": "http://" + IPool().get_proxy()
    }
    response = session.get(url, cookies=jar, proxies=proxies, timeout=5)
    page = response.text
    print("|=========================================================================================================|")

    html = etree.HTML(page)
    company_info = html.xpath('//*[@class="contact"]//text()')
    contact_info_picture_url = html.xpath('//*[@class="contact"]/div/ul/li/img/@src|//*[@class="contact"]/div/p/img/@src')
    return company_info, contact_info_picture_url
def construct_headers(self):
    """Return one header dict per paper-waste search-result page (pages 1-92)."""
    referer_template = (
        "http://www.fengj.com/so/SearchInfo.aspx"
        "?keyword=%b7%cf%d6%bd&info_type=sell&page={}"
    )
    return [
        {
            "Referer": referer_template.format(page),
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": UAPool().get(),
        }
        for page in range(1, 93)
    ]
def __init__(self):
    """Set up the MongoDB connection, a Chrome WebDriver, the listing-page
    URL template and the default request headers for www.b2b168.com."""
    # Local MongoDB instance on the default port.
    self.host = '127.0.0.1'
    self.port = 27017
    self.conn = MongoClient(host=self.host, port=self.port)
    # Opens a real Chrome window; presumably used for pages that need JS — confirm.
    self.driver = webdriver.Chrome()
    # "{}" is filled with the page number by the caller.
    self.base_url = "https://www.b2b168.com/k-waimaofushi/l-{}.html"
    self.headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "Accept-Encoding": "gzip, deflate",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        # Rotating User-Agent drawn from the project's UA pool.
        "User-Agent": UAPool().get(),
        "Host": "www.b2b168.com"
    }
def get_company_url(self, url):
    """Collect the contact-page URL of every company on a listing page.

    :param url: listing page URL to scrape
    :return: list of "<company-home>/contactusNews.aspx" URLs; an empty
             list when the request or parsing fails.
    """
    try:
        session = requests.session()
        session.verify = False
        session.headers = {
            "User-Agent": UAPool().get()
        }

        # Replay previously saved login cookies.
        jar = RequestsCookieJar()
        with open('cookies.txt', 'r', encoding='utf-8') as file:
            for cookie in json.load(file):
                jar.set(cookie['name'], cookie['value'])

        response = session.get(url, cookies=jar)
        html = etree.HTML(response.text)

        # The first anchor in the result list is not a company entry, so it
        # is skipped via a slice (the old `del url_list[0]` raised
        # IndexError on an empty result page). The loop variable is renamed
        # so it no longer shadows the `url` parameter.
        company_url_list = []
        hrefs = html.xpath('//*[@class="pro_lists"]/div/div/h2/a/@href')
        for company_home in hrefs[1:]:
            company_url = company_home + "/contactusNews.aspx"
            print(company_url)
            company_url_list.append(company_url)
        return company_url_list
    except Exception as e:
        # Best-effort scrape: log the failure and return an empty list
        # (instead of an implicit None) so callers can iterate the result
        # unconditionally.
        print(e)
        return []
def construct(self):
    """Build the search URLs and matching header dicts for result pages 2-93.

    :return: tuple ``(url_list, headers2)`` of equal length, where the
             header at index i carries the URL at index i as its Referer.
    """
    template = "http://www.fengj.com/so/SearchInfo.aspx?keyword=%b7%cf%d6%bd&info_type=sell&page={}"
    page_urls = []
    page_headers = []
    for page in range(2, 94):
        page_url = template.format(page)
        page_urls.append(page_url)
        page_headers.append({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Host": "www.fengj.com",
            "Proxy-Connection": "keep-alive",
            "Referer": page_url,
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": UAPool().get()
        })
    return page_urls, page_headers