# Proxy-test bootstrap: load candidate proxies from every configured table,
# queue them for testing, and record this machine's own public IP address.
url_for_proxy_test = ' '

# Database connection settings; the commented-out variant points at a remote host.
# connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}
connect_dict = {'host': 'localhost', 'user': '******', 'passwd': '', 'charset': 'utf8'}

# Earlier single-table loading, kept for reference:
# db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
# proxy_list = map(lambda x: x[0], db_server.getData(var='proxy_port', distinct=True))
# for p in proxy_list:
#     qu_proxy_test.put(p)

# Dotted-quad pattern; the look-arounds reject matches embedded in a longer
# run of digits or dots.
patt_ip = re.compile(r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d])')

# Collect the first column of every 'proxy_port' row from each table that exists.
proxy_list = []
for table_name in table_name_s.split(','):
    # Single-argument call form prints the same text on Python 2 and Python 3.
    print(table_name)
    db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
    if db_server.isTableExist():
        proxy_list.extend(row[0] for row in db_server.getData(var='proxy_port'))

# The same proxy may appear in several tables; queue each one only once.
proxy_list_t = list(set(proxy_list))
for p in proxy_list_t:
    qu_proxy_test.put(p)


def original_ip_address():
    """Return the 'origin' field reported by http://httpbin.org/ip.

    Returns:
        The value stored under 'origin' in the JSON response, or None when
        the key is absent.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
        ValueError: if the response body is not valid JSON.
    """
    # A timeout keeps an unreachable endpoint from blocking module start-up
    # indefinitely; requests waits without limit when none is given.
    response = requests.get('http://httpbin.org/ip', timeout=10)
    return json.loads(response.text).get('origin')


original = original_ip_address()
# NOTE(review): the statements on the next line appear to have been collapsed
# onto one physical line (line breaks and indentation lost). As written,
# everything after the first '#' is parsed as a comment; restore the original
# line structure before relying on this code.
#
# Reading the text as the script it appears to have been, it:
#   - sets the interpreter default string encoding to 'utf8';
#   - defines the target database ('platform_data'), table ('suning'), the
#     comma-separated column list (table_title) and the crawl start URL;
#   - opens a DBService for that table and, when isTableExist() is false,
#     creates the table from table_title split on ',';
#   - starts a Handler(BaseHandler) class: on_start, decorated with
#     @every(minutes=24 * 60), crawls url_start with step_first as callback;
#     step_first, decorated with @config(age=2 * 24 * 60 * 60), iterates the
#     elements matched by '.listLeft>dl>dd>span>a'. The loop body is cut off
#     in this view.
# NOTE(review): connect_dict embeds a host address and password in source;
# consider loading them from configuration instead.
# NOTE(review): in Python 2 sys.setdefaultencoding is only available after
# reload(sys) -- confirm that call precedes this statement.
sys.setdefaultencoding('utf8') # config_text db_name = 'platform_data' table_name = 'suning' table_title = 'product_url,catalogue,sub_catalogue,product_title,promotion_desc,origin_price,price,' \ 'product_stars,comment_count,sending_service,other_service,product_params,shop_name,' \ 'shop_href,product_rating,product_rating_avg,serice_rating,service_rating_avg,express_rating,' \ 'express_rating_avg,com_name_tel,crawl_time' url_start = 'http://www.suning.com/emall/pgv_10052_10051_1_.html' # start url for crawl,string connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'} # script db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict) if not db_server.isTableExist(): db_server.createTable(tableTitle=table_title.split(',')) class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl(url_start, callback=self.step_first) @config(age=2 * 24 * 60 * 60) def step_first(self, response): d = response.doc for t in d('.listLeft>dl>dd>span>a').items():
# NOTE(review): the statements on the next line appear to have been collapsed
# onto one physical line (line breaks and indentation lost), so it does not
# parse as written; restore the original line structure before relying on it.
#
# Reading the text as the script it appears to have been, it:
#   - defines the database ('address'), two table names with their
#     comma-separated column lists, and the crawl start URL;
#   - opens one DBService per table (db_server_1, db_server_2) and creates
#     each table from its column list when isTableExist() is false;
#   - compiles three patterns: pat_num (a run of digits), pat_replace_space
#     (whitespace) and pat_comment (captures the text between 'var arr=' and
#     the next ';');
#   - starts a Handler(BaseHandler) class whose crawl_config sets request
#     headers; the dict literal is cut off in this view.
# NOTE(review): the regex literals are not raw strings ('\d+', '\s+?');
# prefer r'...' so the backslashes are not treated as string escapes.
# NOTE(review): '\s+?' is lazy with nothing following it, so each match covers
# exactly one whitespace character -- confirm that is intended.
# NOTE(review): the 'User-Agent' header value itself begins with the text
# 'User-Agent:', which looks like a duplicated header name -- verify.
# NOTE(review): connect_dict embeds a host address and password in source;
# consider loading them from configuration instead.
db_name = 'address' table_name_1 = 'ershoufang_58city_baseinfo' table_name_2 = 'ershoufang_58city_detail' table_title_1 = 'detail,crawl_time' table_title_2 = 'url,detail,crawl_time' url_start = 'http://www.58.com/ershoufang/changecity/' # connect string , usually no need to modify connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'} db_server_1 = DBService(dbName=db_name, tableName=table_name_1, **connect_dict) db_server_2 = DBService(dbName=db_name, tableName=table_name_2, **connect_dict) # if create table for store result in mysql , no need to be changed if not db_server_1.isTableExist(): db_server_1.createTable(tableTitle=table_title_1.split(',')) if not db_server_2.isTableExist(): db_server_2.createTable(tableTitle=table_title_2.split(',')) pat_num = re.compile('\d+') pat_replace_space = re.compile('\s+?') pat_comment = re.compile('var arr=(.+?)\;') class Handler(BaseHandler): crawl_config = { # 'proxy': '10.10.10.10:80', 'headers': { 'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) '