from bs4 import BeautifulSoup


# `WebParser` and `ProxyRecord` come from the surrounding project; `self.logger`
# is presumably attached elsewhere (e.g. by a logging mixin or decorator).
class ProxyPoolScraper:
    def __init__(self, url, bs_parser="lxml"):
        self.parser = WebParser(url)
        self.bs_parser = bs_parser

    def get_proxy_stream(self, limit):
        raw_records = self.extract_table_raw_records()
        clean_records = list(map(self._clear_up_record, raw_records))
        for record in clean_records[:limit]:
            self.logger.info(f"Proxy record: {record}")
            if record:
                yield ProxyRecord(*record)

    def extract_table_raw_records(self):
        # Fetch the page and return every row of the table identified by id="list".
        content = self.parser.get_content()
        soup_object = BeautifulSoup(content, self.bs_parser)
        return soup_object.find(id="list").find_all("tr")

    def _clear_up_record(self, raw_record):
        # Reduce a table row to the text of its <td> cells.
        return [val.text for val in raw_record.find_all("td")]
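# The classes in this section all rely on a WebParser helper whose source is
# not included. The class below is a minimal sketch, assuming only what the
# call sites imply: a URL plus an optional rotate_header flag, and a
# get_content() that takes optional timeout/proxies and returns the response
# body, or None on failure. Names and behavior are assumptions, not the
# project's actual implementation.
import random

import requests


class WebParser:
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (X11; Linux x86_64)",
    ]

    def __init__(self, url, rotate_header=False):
        self.url = url
        self.rotate_header = rotate_header

    def __str__(self):
        return f"WebParser(url={self.url})"

    def get_content(self, timeout=10, proxies=None):
        # Optionally rotate the User-Agent; swallow request errors so callers
        # can treat None as "fetch failed".
        headers = (
            {"User-Agent": random.choice(self.user_agents)}
            if self.rotate_header
            else {}
        )
        try:
            response = requests.get(
                self.url, headers=headers, timeout=timeout, proxies=proxies
            )
            response.raise_for_status()
            return response.content
        except requests.RequestException:
            return None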
class ProxyPoolValidator:
    def __init__(self, url, timeout=10):
        self.timeout = timeout
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        content = self.parser.get_content(
            timeout=self.timeout, proxies=proxy_record.proxy
        )
        proxy_status = ProxyStatus(proxy_record.proxy, content is not None)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
import atoma


# `NewsFormatter` is a project-level helper that normalizes RSS entries.
class NewsProducer:
    def __init__(self, rss_feed):
        self.parser = WebParser(rss_feed, rotate_header=True)
        self.formatter = NewsFormatter()

    def _extract_news_feed_items(self, proxies):
        content = self.parser.get_content(proxies=proxies)
        news_feed = atoma.parse_rss_bytes(content)
        return news_feed.items

    def get_news_stream(self, proxies):
        news_feed_items = self._extract_news_feed_items(proxies)
        for entry in news_feed_items:
            formatted_entry = self.formatter.format_entry(entry)
            yield formatted_entry
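# A minimal usage sketch of NewsProducer. The feed URL and the requests-style
# proxies mapping below are placeholders, not values from the project.
producer = NewsProducer("https://example.com/feed.rss")
proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
for formatted_entry in producer.get_news_stream(proxies):
    print(formatted_entry)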
import time


class ProxyPoolValidator:
    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        # Probe the proxy several times and score its health as the fraction
        # of successful fetches.
        consecutive_checks = []
        for _ in range(self.checks):
            content = self.parser.get_content(
                timeout=self.timeout, proxies=proxy_record.proxy
            )
            time.sleep(self.sleep_interval)
            consecutive_checks.append(int(content is not None))

        health = sum(consecutive_checks) / self.checks
        proxy_status = ProxyStatus(
            proxy=proxy_record.proxy,
            health=health,
            is_valid=health > 0.66,
        )
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
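# The snippets above construct ProxyRecord(*record) from scraped table cells
# and ProxyStatus(proxy=..., health=..., is_valid=...), but neither type is
# defined here. The containers below are a hedged sketch consistent with
# those call sites: the ProxyRecord column layout is an assumption about the
# scraped table, and ProxyStatus mirrors the multi-check validator variant.
from dataclasses import dataclass, field
from typing import NamedTuple


@dataclass
class ProxyRecord:
    ip: str
    port: str
    country_code: str
    country: str
    anonymity: str
    google: str
    https: str
    last_checked: str
    proxy: dict = field(init=False)

    def __post_init__(self):
        # requests-style mapping consumed by WebParser.get_content(proxies=...)
        address = f"http://{self.ip}:{self.port}"
        self.proxy = {"http": address, "https": address}


class ProxyStatus(NamedTuple):
    proxy: dict
    health: float
    is_valid: bool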
import json
import sys

from bot import Bot
from message import PhoneMessage
from parser import WebParser


def get_amazon_links():
    with open('data/links.json') as links_file:
        json_links = json.load(links_file)
    wipe_products_links = json_links['Wipes']
    spray_products_links = json_links['Spray']
    return wipe_products_links, spray_products_links


if __name__ == "__main__":
    wipe_products_links, spray_products_links = get_amazon_links()
    web_parser = WebParser()
    bot = Bot(web_parser)
    bot.start_scrapying_process(wipe_products_links)
    bot.start_scrapying_process(spray_products_links)
    stocked_products_urls = bot.webparser.stocked_product_links
    # Couldn't find any products that were in stock.
    if not stocked_products_urls:
        sys.exit(1)
    else:
        phone_message = PhoneMessage(stocked_products_urls)
        phone_message.send_message()
def test__str__representation(url, expected):
    # `url` and `expected` are supplied by a pytest parametrize decorator
    # that is not part of this snippet.
    web_parser = WebParser(url)
    result = str(web_parser)
    assert result == expected
def web_parser():
    # Presumably registered as a pytest fixture (the decorator is not shown);
    # TEST_URL is a module-level constant in the test suite.
    yield WebParser(TEST_URL)
import time

import requests

# `WebParser`, `MongoDB`, `get_proxy`, and `headers` are project-level
# helpers/globals assumed to be importable alongside this class.


class JobSpider(object):
    def __init__(self, keywords_dict, redis_key):
        # self.bloom_filter = BloomFilter(redis.StrictRedis(host='localhost', port=6379), 'job_url')
        self.parser = WebParser(redis_key)
        self.keywords = keywords_dict

    def crawl_zhilian(self, city, keyword):
        # url_list = []
        # TODO: turn url_list into a stack
        begin_url = ('https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90'
                     '&cityId={city}&salary=0,0&workExperience=-1&education=-1'
                     '&companyType=-1&employmentType=-1&jobWelfareTag=-1'
                     '&kw={keyword}&kt=3')
        database = MongoDB('zhilian', self.keywords[keyword])
        # web_name is passed so the zhilian-specific parsers are used, mirroring
        # crawl_qiancheng/crawl_liepin (the original omitted it).
        url_list = self._get_list(begin_url, city, keyword,
                                  page_weight=90, web_name='zhilian')
        print(keyword, city, 'list parser done!')
        print(len(url_list))
        self._get_content(database, url_list, web_name='zhilian')

    def crawl_qiancheng(self, city, keyword):
        begin_url = ('https://search.51job.com/list/{city},000000,0000,00,9,99,'
                     '{keyword},2,{page}.html?lang=c&postchannel=0000'
                     '&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                     '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
        database = MongoDB('qiancheng', self.keywords[keyword])
        url_list = self._get_list(begin_url, city, keyword,
                                  page_begin=1, web_name='qiancheng')
        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))
            self._get_content(database, url_list, web_name='qiancheng')

    def crawl_liepin(self, city, keyword):
        begin_url = ('https://www.liepin.com/city-{city}/zhaopin/pn{page}/'
                     '?d_pageSize=40&jobKind=2&key={keyword}')
        database = MongoDB('liepin', self.keywords[keyword])
        url_list = self._get_list(begin_url, city, keyword,
                                  page_begin=0, web_name='liepin')
        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))
            self._get_content(database, url_list, web_name='liepin')

    def crawl_boss(self):
        pass

    def crawl_shixi(self):
        pass

    def crawl_lagou(self):
        pass

    def _anti_progrosse(self):
        # TODO: anti-scraping proxy helper. Still a stub: begin_url, page,
        # city, and keyword are not in scope here.
        proxy = get_proxy.get_proxy()
        if proxy:  # the original tested `if not proxy`, which inverted the logic
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy,
            }
            response = requests.get(begin_url.format(page * 90, city, keyword),
                                    headers=headers, proxies=proxies)
            if response.status_code != 200:
                print('proxy mode fail!!! please wait a few time, and try again')
                return
            urls = self.parser.list_zhilian(response.text)
        else:
            print("Can't seek useful proxy!")
            return

    def _get_content(self, database, url_list, web_name=None):
        # TODO: switch this to multi-threading
        if url_list:
            for url in url_list:
                try:
                    response = requests.get(url, headers=headers)
                    if response.status_code != 200:
                        print('anti-spider in content: ', response.status_code)
                        print('error url:', url)
                        # TODO: anti-scraping proxy helper
                        time.sleep(3)
                        response = requests.get(url, headers=headers)
                        if response.status_code != 200:
                            print('give up:', url)
                        else:
                            if web_name == 'zhilian':
                                self.parser.content_zhilian(response, database, url)
                            if web_name == 'qiancheng':
                                self.parser.content_qiancheng(response, database, url)
                            if web_name == 'liepin':
                                self.parser.content_liepin(response, database, url)
                        continue
                    if web_name == 'zhilian':
                        self.parser.content_zhilian(response, database, url)
                    if web_name == 'qiancheng':
                        self.parser.content_qiancheng(response, database, url)
                    if web_name == 'liepin':
                        self.parser.content_liepin(response, database, url)
                except Exception as e:
                    print('request_job_contain error : {}'.format(e))

    def _get_list(self, begin_url, city, keyword,
                  page_weight=1, page_begin=0, web_name=None):
        url_list = []
        for page in range(1000):
            urls = []
            try:
                u = begin_url.format(page=page * page_weight + page_begin,
                                     city=city, keyword=keyword)
                response = requests.get(u, headers=headers)
                if response.status_code != 200:
                    print('anti-spider in list')
                    continue  # if the proxy block below is enabled, remember to remove this
                    """
                    # TODO: anti-scraping proxy helper
                    # proxy = get_proxy.get_proxy()
                    # if not proxy:
                    #     proxies = {
                    #         'http': 'http://' + proxy,
                    #         'https': 'https://' + proxy,
                    #     }
                    #     response = requests.get(begin_url.format(page * 90, city, keyword),
                    #                             headers=headers, proxies=proxies)
                    #     if response.status_code != 200:
                    #         print('proxy mode fail!!! please wait a few time, and try again')
                    #         return
                    #     urls = self.parser.list_zhilian(response.text)
                    # else:
                    #     print("Can't seek useful proxy!")
                    #     return
                    """
                else:
                    if web_name == 'zhilian':
                        urls = self.parser.list_zhilian(response)
                    if web_name == 'qiancheng':
                        urls = self.parser.list_qiancheng(response)
                    if web_name == 'liepin':
                        urls = self.parser.list_liepin(response)
                if not urls:  # the original `urls == (None or [])` only matched []
                    break
                url_list.extend(urls)
            except Exception as e:
                print('request_job_list error : {}'.format(e))
        return url_list
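# A hypothetical driver for JobSpider. The keyword-to-collection mapping, the
# redis_key value, and the city codes are placeholder values, not taken from
# the project.
if __name__ == "__main__":
    keywords = {"python": "python_jobs"}
    spider = JobSpider(keywords, redis_key="job_url")
    spider.crawl_qiancheng(city="020000", keyword="python")
    spider.crawl_liepin(city="020", keyword="python")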