import time
import logging
import traceback
from bs4 import BeautifulSoup

# Module-level configuration (URL prefixes such as common_url_prefix,
# province_url and book_tags_url, the special_url / special_url_conn tables,
# and the *_code_list caches) is defined elsewhere in the original project.


def crawl_street_list():
    # Debug shortcut kept from the original:
    # page_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/52/03/520324.html"
    # response = request_util(page_url, 'gb18030')
    # print response
    # return
    global district_code_list
    district_code_list = query_data(3)
    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                # Use only the matching special entry; the original fetched
                # every special URL and kept just the last response.
                for special_item in special_url:
                    if special_item['page_url'] == page_url:
                        response = request_util(special_item['page_url'],
                                                special_item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="towntable").find_all(
                "tr", class_="towntr")
            for row_k, row in enumerate(info_list):
                if row.contents[0].find('a', {'href': True}):
                    #street_url = street_url_prefix + row.contents[0].a.attrs['href'].encode('utf8')
                    code = row.contents[0].a.get_text().encode('utf8')
                    name = row.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_district_code(code)
                    level = 4
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
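# A minimal sketch of the undefined `request_util` helper as the crawlers use
# it: fetch a page and decode it with the given encoding (the stats.gov.cn
# pages are served in GBK variants). The `requests` dependency and the 10s
# timeout are assumptions, not the project's confirmed implementation.
import requests

def request_util(url, encoding=None):
    resp = requests.get(url, timeout=10)  # hypothetical timeout choice
    if encoding:
        # Decode explicitly; these pages often mis-declare their charset
        return resp.content.decode(encoding, 'ignore')
    return resp.text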
def crawl_community_list():
    global street_code_list
    street_code_list = query_data(4)
    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                # Same fix as crawl_street_list: use only the matching entry.
                for special_item in special_url:
                    if special_item['page_url'] == page_url:
                        response = request_util(special_item['page_url'],
                                                special_item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="villagetable").find_all(
                "tr", class_="villagetr")
            for row_k, row in enumerate(info_list):
                # villagetr rows: code / urban-rural class code / name
                code = row.contents[0].get_text().encode('utf8')
                name = row.contents[2].get_text().encode('utf8')
                parent_code, parent_name = get_street_code(code)
                level = 5
                print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
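# Hedged sketch of the parent-lookup helpers called above; they are not
# defined in this file. Statistical division codes are 12 digits: a street
# shares its parent district's first 6 digits, and a community shares its
# parent street's first 9. The prefix-matching rule and the shape of the
# cached lists ({'code': ..., 'name': ...}) are inferred, not confirmed.
def get_district_code(code):
    for item in district_code_list:
        if item['code'][:6] == code[:6]:
            return item['code'], item['name']
    return None, None

def get_street_code(code):
    for item in street_code_list:
        if item['code'][:9] == code[:9]:
            return item['code'], item['name']
    return None, None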
def crawl_page(url):
    try:
        response = request_util(url)
        soup = BeautifulSoup(response, "html.parser")
        info_list = soup.find('ul', class_="subject-list").find_all('li')
        for item in info_list:
            # Book title
            print item.find('div', class_="info").find("h2").find(
                "a").get_text().strip().replace('\n', '').replace(
                    ' ', '').encode('utf8')
            # Rating
            print item.find("div", class_="star").find(
                "span", class_="rating_nums").get_text().strip()
            # The "pub" line reads "author(s) / publisher / date / price";
            # strip whitespace and drop the trailing three fields. The loop
            # no longer shadows `item`, and the slice replaces the original
            # pop(-3)/pop(-2)/pop(-1) dance with the same result.
            authors = item.find("div", class_="pub").get_text().strip().split("/")
            new_authors = [author.strip() for author in authors][:-3]
            print new_authors
            #print '/'.join(new_authors)
    except Exception:
        print traceback.format_exc()
def crawl_tags(url):
    # The original body referenced a bare `url` that was never defined in
    # this scope; it is now an explicit parameter.
    response = request_util(url)
    try:
        print response
    except Exception:
        print traceback.format_exc()
def crawl_province_list():
    response = request_util(province_url)
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('div', class_="TRS_PreAppend").find_all(
            "p", class_="MsoNormal")
        for k, item in enumerate(info_list):
            code_item = {}
            code = item.find("span", attrs={'lang': 'EN-US'})
            code_item['code'] = code.get_text().strip()
            content_list = item.find_all("span")
            code_item['name'] = content_list[-1].get_text().strip()
            code_list.append(code_item)
        for k, item in enumerate(code_list):
            # Province-level entries end in "0000" (e.g. 110000)
            if item['code'].find("0000") > 0:
                code = item['code'].encode('utf8') + "000000"
                name = item['name'].encode('utf8')
                parent_code = 0
                level = 1
                p_code_item = {'code': code, 'name': name}
                province_code_list.append(p_code_item)
                #insert_data(code, name, parent_code, level)
    except Exception:
        print traceback.format_exc()
def crawl_district_list():
    global city_code_list
    city_code_list = query_data(2)
    try:
        page_urls = generate_page_url()
        for k, page_url in enumerate(page_urls):
            print page_url
            response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="countytable").find_all(
                "tr", class_="countytr")
            for row_k, row in enumerate(info_list):
                if row.contents[0].find('a', {'href': True}):
                    #street_url = common_url_prefix + url_code + row.contents[0].a.attrs['href'].encode('utf8')
                    code = row.contents[0].a.get_text().encode('utf8')
                    name = row.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_city_code(code)
                    level = 3
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)
                    #crawl_street_detail(street_url)
    except Exception:
        print traceback.format_exc()
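# Hedged sketch of the storage helpers (`insert_data` / `query_data`) used
# throughout the crawlers. The original backend is not shown, so sqlite3
# stands in here; the database file, table and column names are assumptions.
import sqlite3

conn = sqlite3.connect('region.db')  # hypothetical database file
conn.execute("CREATE TABLE IF NOT EXISTS region ("
             "code TEXT, name TEXT, parent_code TEXT, "
             "parent_name TEXT, level INTEGER)")

def insert_data(code, name, parent_code, parent_name, level):
    conn.execute("INSERT INTO region VALUES (?, ?, ?, ?, ?)",
                 (code, name, parent_code, parent_name, level))
    conn.commit()

def query_data(level):
    rows = conn.execute("SELECT code, name FROM region WHERE level = ?",
                        (level,))
    return [{'code': r[0], 'name': r[1]} for r in rows]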
def crawl_district_detail(url, url_code):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="countytable").find_all(
            "tr", class_="countytr")
        # First pass: cache every district code/name pair
        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            d_code_item = {'code': code, 'name': name}
            district_code_list.append(d_code_item)
        # Second pass: persist each district and recurse into its streets
        for k, item in enumerate(info_list):
            if item.contents[0].find('a', {'href': True}):
                street_url = common_url_prefix + url_code + item.contents[
                    0].a.attrs['href'].encode('utf8')
                code = item.contents[0].a.get_text().encode('utf8')
                name = item.contents[1].a.get_text().encode('utf8')
                parent_code, parent_name = get_city_code(code)
                level = 3
                #print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)
                crawl_street_detail(street_url)
    except Exception:
        print traceback.format_exc()
def crawl_tags():
    response = request_util(book_tags_url)
    try:
        soup = BeautifulSoup(response, "html.parser")
        info_list = soup.find_all('table', class_="tagCol")
        for item in info_list:
            content_list = item.find_all("a")
            for content in content_list:
                page_url_prefix = book_url + content.attrs['href'].encode('utf8')
                tag = page_url_prefix.split("tag/", 1)[1]
                crawl_book_list(page_url_prefix, tag)
    except Exception:
        print traceback.format_exc()
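# Hedged sketch of the undefined `crawl_book_list` referenced above: walk a
# tag's paged listing and hand each page to crawl_page. Douban tag listings
# paginate with a `start` offset; the page size of 20, the `?start=N&type=T`
# query shape, and the five-page cap are assumptions for illustration.
def crawl_book_list(page_url_prefix, tag):
    for start in range(0, 100, 20):
        page_url = page_url_prefix + "?start={0}&type=T".format(start)
        print page_url, tag
        crawl_page(page_url)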
def crawl_city_detail(url):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="citytable").find_all(
            "tr", class_="citytr")
        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
def crawl_city_list():
    response = request_util(list_url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="provincetable").find_all(
            "tr", class_="provincetr")
        for k, item in enumerate(info_list):
            content_list = item.find_all("a")
            for c_k, c_item in enumerate(content_list):
                d_url = common_url_prefix + c_item.attrs['href'].encode('utf8')
                print d_url
                crawl_city_detail(d_url)
                # The breaks limit the crawl to a few provinces for testing
                if c_k > 1:
                    break
            break
    except Exception:
        print traceback.format_exc()
def crawl_street_detail(url):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="towntable").find_all(
            "tr", class_="towntr")
        for k, item in enumerate(info_list):
            if item.contents[0].find('a', {'href': True}):
                #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                code = item.contents[0].a.get_text().encode('utf8')
                name = item.contents[1].a.get_text().encode('utf8')
                # Note: level-4 streets are resolved against the city table
                # here, whereas crawl_street_list uses get_district_code;
                # the district lookup is probably what was intended.
                parent_code, parent_name = get_city_code(code)
                level = 4
                print code, name, parent_code, parent_name
                #insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
def hosts_info_instance(url, timestamp=None, timeout='2m', **kwargs):
    '''GET api/v1/hosts/<instance_name>'''
    # Default arguments are evaluated once at definition time, so the
    # original `timestamp=time.time()` froze the timestamp at import;
    # resolve it per call instead.
    if timestamp is None:
        timestamp = time.time()
    results = {}
    node_info = read_info()
    url = url + '/api/v1/query'
    for info in node_info:
        query = info.strip('\n')
        param = {
            'query': query + '{' + dic_to_str(kwargs) + '}',
            'timestamp': timestamp,
            'timeout': timeout
        }
        data = request_util(url, param)
        # Build one {instance: value} map per query; a fresh dict each
        # iteration, otherwise every entry of `results` aliases the same map.
        rlt_values = {}
        for sample in data:
            rlt_values.setdefault(sample['metric']['instance'],
                                  sample['value'][1])
        results[query] = rlt_values
    logging.debug('host_info_instance results are : %s', results)
    return results
def hosts_info(url, timestamp=None, timeout='2m'):
    # As above, avoid a default evaluated once at definition time.
    if timestamp is None:
        timestamp = time.time()
    results = {}
    node_info = read_info()
    url = url + '/api/v1/query'
    for info in node_info:
        query = info.strip('\n')
        param = {
            'query': query,
            'timestamp': timestamp,
            'timeout': timeout
        }
        data = request_util(url, param)
        # Fresh dict per query so entries of `results` do not alias each other
        rlt_values = {}
        for sample in data:
            rlt_values.setdefault(sample['metric']['instance'],
                                  sample['value'][1])
        results[query] = rlt_values
    logging.info('results are: %s', results)
    return results
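# Hedged sketches of the helpers the Prometheus query functions rely on; none
# are defined in this file. Note that this `request_util` variant (query
# endpoint plus a param dict, returning the JSON result list) differs from the
# HTML-fetching one used by the crawlers, so the two presumably live in
# separate modules. The `requests` dependency, the envelope unwrapping, and
# the node_info.txt filename are all assumptions.
import requests

def request_util(url, param):
    # Prometheus wraps results as {'status': ..., 'data': {'result': [...]}}
    resp = requests.get(url, params=param, timeout=10)
    return resp.json()['data']['result']

def dic_to_str(kwargs):
    # {'job': 'node'} -> "job='node'", a label-matcher list for the query
    return ','.join("{0}='{1}'".format(key, value)
                    for key, value in kwargs.items())

def read_info():
    # One metric name per line; callers strip the trailing newline
    with open('node_info.txt') as f:  # hypothetical filename
        return f.readlines()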
def crawl_city_list():
    global province_code_list
    province_code_list = query_data(1)
    response = request_util(city_url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="provincetable").find_all(
            "tr", class_="provincetr")
        for k, item in enumerate(info_list):
            content_list = item.find_all("a")
            for c_k, c_item in enumerate(content_list):
                # Province links look like "52.html"; keep the numeric prefix
                # (the original also stored the raw href in an unused d_url).
                url_city_code = c_item.attrs['href'].encode('utf8').split(".")[0]
                d_city_url = common_url_prefix + url_city_code + ".html"
                print d_city_url
                crawl_city_detail(d_city_url)
                # if c_k > 1:
                #     break
    except Exception:
        print traceback.format_exc()
def single_host_cpu(url, instance, timestamp=None, timeout='2m'):
    '''
    :param url: The base url of the Prometheus server to query.
    :param instance: a string in 'ip:port' format
    :param timestamp: Unix timestamp, defaults to time.time() at call time
    :param timeout: Query timeout, defaults to 2 minutes
    :return: A dict containing the CPU info of the instance node.
    '''
    if timestamp is None:
        timestamp = time.time()
    url = url + '/api/v1/query'
    param = {
        "query": "node_cpu{{ instance='{0}' }}".format(instance),
        "timestamp": timestamp,
        "timeout": timeout
    }
    data = request_util(url, param)
    logging.info("getting cpu cores...")
    cores = cpu_cores(url)
    logging.info("The Cores of CPU are: %s", cores)
    cpu_results = {
        "instance": instance,
        "cpu": {}
    }
    logging.info("Result of request data is %s", len(data))
    for i in range(cores):
        values = {}
        for index in range(len(data)):
            # Walk the whole result set, grouping samples by CPU: for every
            # sample that belongs to cpu{i}, record its mode and value.
            if data[index]['metric']['cpu'] == 'cpu{0}'.format(i):
                values.setdefault(data[index]['metric']['mode'],
                                  data[index]['value'][1])
        # Each CPU gets its own dict of per-mode values
        cpu_results['cpu']['cpu{0}'.format(i)] = values
    return cpu_results
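# Hedged sketch of the undefined `cpu_cores` helper: count the distinct `cpu`
# labels a plain node_cpu query returns. The caller above passes in a URL
# already suffixed with /api/v1/query, so it is used directly. The real helper
# may count cores differently; this is an assumption.
def cpu_cores(url):
    data = request_util(url, {'query': 'node_cpu'})
    return len(set(sample['metric']['cpu'] for sample in data))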
def crawl_city_detail(url):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="citytable").find_all(
            "tr", class_="citytr")
        # First pass: cache every city code/name pair
        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            c_code_item = {'code': code, 'name': name}
            city_code_list.append(c_code_item)
        # Second pass: persist each city and recurse into its districts
        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)
            href = item.contents[0].a.attrs['href'].encode('utf8')
            district_url = common_url_prefix + href
            # The original called crawl_district_detail with one argument,
            # but it takes (url, url_code). Deriving url_code from the href's
            # directory part (e.g. "52/5201.html" -> "52/") is an assumption
            # about the site's relative-link layout.
            url_code = href.split("/")[0] + "/"
            #print district_url
            crawl_district_detail(district_url, url_code)
    except Exception:
        print traceback.format_exc()
def single_host_info(url, instance, timestamp=None, timeout='2m'):
    '''
    :param url: The base url of the Prometheus server to query.
    :param instance: a string in 'ip:port' format
    :param timestamp: Unix timestamp, defaults to time.time() at call time
    :param timeout: Query timeout, defaults to 2 minutes
    :return: A dict containing the metrics of the instance node.
    '''
    if timestamp is None:
        timestamp = time.time()
    url = url + '/api/v1/query'
    data_results = {
        "href": "http://10.110.13.216:9200/api/v1/hosts/{0}?fields=metrics".format(instance),
        "instance": instance,
        "boot": {},
        "cpu": {},
        "filesystem": {},
        "disk": {},
        "load": {},
        "memory": {},
        "network": {},
        "procs": {}
    }
    single_host = read_host_info()
    single_output = ["disk", "load", "memory", "procs"]
    multi_output = ["cpu", "filesystem", "network"]
    for info in single_host:
        logging.info('Reading %s successfully.', info)
        query = info.strip('\n')
        param = {
            'query': query + "{{ instance='{0}' }}".format(instance),
            'timestamp': timestamp,
            'timeout': timeout
        }
        data = request_util(url, param)
        for key in single_output:
            # Keys in single_output produce a flat {metric: value} entry
            if key in info:
                data_results[key].setdefault(data[0]['metric']['__name__'],
                                             data[0]['value'][1])
        if 'node_cpu' in info:
            # Find the distinct CPUs; the cpu label repeats across samples,
            # so deduplicate through a set
            cpu_elements = set()
            for index in range(len(data)):
                cpu_elements.add(data[index]['metric']['cpu'])
            for element in cpu_elements:
                data_results['cpu'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole result set, grouping samples by cpu:
                    # pack each mode and its value into
                    # data_results['cpu'][element]
                    if data[index]['metric']['cpu'] == element:
                        data_results['cpu'][element].setdefault(
                            data[index]['metric']['mode'],
                            data[index]['value'][1])
        elif 'node_network' in info:
            # Find the distinct network devices; device labels repeat across
            # samples, so deduplicate through a set
            network_elements = set()
            for index in range(len(data)):
                network_elements.add(data[index]['metric']['device'])
            for element in network_elements:
                data_results['network'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole result set, grouping samples by device:
                    # pack each metric name and its value into
                    # data_results['network'][element]
                    if data[index]['metric']['device'] == element:
                        data_results['network'][element].setdefault(
                            data[index]['metric']['__name__'],
                            data[index]['value'][1])
        elif 'node_filesystem' in info:
            # pprint(data)  # debug leftover
            # Find the distinct filesystem devices; device labels repeat
            # across samples, so deduplicate through a set
            fs_elements = set()
            for index in range(len(data)):
                fs_elements.add(data[index]['metric']['device'])
            for element in fs_elements:
                data_results['filesystem'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole result set, grouping samples by device:
                    # pack each metric name and its value into
                    # data_results['filesystem'][element]
                    if data[index]['metric']['device'] == element:
                        data_results['filesystem'][element].setdefault(
                            data[index]['metric']['__name__'],
                            data[index]['value'][1])
        elif 'node_boot' in info:
            # node_boot_time is an epoch; store the uptime instead
            data_results['boot'].setdefault(
                data[0]['metric']['__name__'],
                time.time() - float(data[0]['value'][1]))
        else:
            pass
        # logging.info("Result of request data is %s", data)
    return data_results
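# Hedged sketch of `read_host_info` plus a usage example. Presumably it
# mirrors read_info and returns one metric name per line from a config file;
# the filename, endpoint, and instance below are hypothetical.
def read_host_info():
    with open('host_info.txt') as f:  # hypothetical filename
        return f.readlines()

# Example:
#   info = single_host_info('http://10.110.13.216:9090', '10.110.13.216:9100')
#   print info['memory']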