Example #1
def crawl_street_list():
    # page_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/52/03/520324.html"
    # response = request_util(page_url,'gb18030');
    # print response
    # return
    global district_code_list
    district_code_list = query_data(3)

    
    try:
        page_urls = generate_page_url()
        for k,page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table',class_="towntable").find_all("tr",class_="towntr")
            for k,item in enumerate(info_list):

                if item.contents[0].find('a',{'href':True}):
                    #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')
                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code,parent_name = get_district_code(code)
                    level = 4
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)
           
    except Exception, e:
        print traceback.format_exc()
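
All of the crawler examples on this page call a request_util helper that is not shown here. Judging from the call sites, it takes a page URL and an optional encoding (the stats.gov.cn pages are served as GBK/GB2312) and returns the decoded HTML that is handed to BeautifulSoup. A minimal sketch under that assumption, using requests; the signature and default encoding are guesses:

# Hypothetical sketch of request_util as used by the crawler examples:
# fetch the page and decode it with the caller-supplied encoding.
import requests

def request_util(url, encoding='utf-8'):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    resp.encoding = encoding       # force the declared page encoding
    return resp.text               # decoded HTML, ready for BeautifulSoup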
Example #2
def crawl_community_list():
   
    global street_code_list
    street_code_list = query_data(4)
    
    try:
        page_urls = generate_page_url()
        for k,page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table',class_="villagetable").find_all("tr",class_="villagetr")
            for k,item in enumerate(info_list):
               
                #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                code = item.contents[0].get_text().encode('utf8')
                name = item.contents[2].get_text().encode('utf8')
                parent_code,parent_name = get_street_code(code)
                level = 5
                print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)
           
    except Exception, e:
        print traceback.format_exc()
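
The get_district_code, get_street_code, get_city_code and get_province_code helpers used by these crawlers are also not listed. The statistical division codes are hierarchical (the 12-digit village code starts with the 9-digit street prefix, which starts with the 6-digit district prefix, and so on), so a plausible lookup matches prefixes against the cached lists. The prefix length and the list shape below are assumptions, not taken from this page:

# Hypothetical parent lookup: match the child code against the cached parent
# list by prefix. street_code_list entries are assumed to be dicts with
# 'code' and 'name' keys, as built by query_data(4).
def get_street_code(code):
    for item in street_code_list:
        # the first 9 digits of a village code identify its street
        if code[:9] == item['code'][:9]:
            return item['code'], item['name']
    return None, None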
Example #3
def crawl_page(url):

    try:
        response = request_util(url)
        soup = BeautifulSoup(response, "html.parser")
        info_list = soup.find('ul', class_="subject-list").find_all('li')
        for item in info_list:
            print item.find(
                'div',
                class_="info").find("h2").find("a").get_text().strip().replace(
                    '\n', '').replace(' ', '').encode('utf8')
            print item.find("div", class_="star").find(
                "span", class_="rating_nums").get_text().strip()
            authors = item.find("div",
                                class_="pub").get_text().strip().split("/")

            new_authors = []
            # Use a separate loop variable so the outer `item` is not shadowed.
            for author in authors:
                new_authors.append(author.strip())

            # Drop the last three '/'-separated fields (typically publisher,
            # year and price), keeping only the author names.
            for i in range(-3, 0):
                new_authors.pop(i)

            print new_authors
            #print '/'.join(new_authors)

    except Exception, e:
        print traceback.format_exc()
Example #4
def crawl_tags():
    response = request_util(url)
    try:
        print response

    except Exception, e:
        print traceback.format_exc()
Example #5
def crawl_province_list():
    response = request_util(province_url)
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('div', class_="TRS_PreAppend").find_all(
            "p", class_="MsoNormal")
        for k, item in enumerate(info_list):

            code_item = {}
            code = item.find("span", attrs={'lang': 'EN-US'})
            code_item['code'] = code.get_text().strip()
            content_list = item.find_all("span")
            code_item['name'] = content_list[len(content_list) -
                                             1].get_text().strip()
            code_list.append(code_item)

        for k, item in enumerate(code_list):
            if item['code'].find("0000") > 0:
                code = item['code'].encode('utf8') + "000000"
                name = item['name'].encode('utf8')
                parent_code = 0
                level = 1

                p_code_item = {}
                p_code_item['code'] = code
                p_code_item['name'] = name
                province_code_list.append(p_code_item)
                #insert_data(code, name, parent_code, level)

    except Exception, e:
        print traceback.format_exc()
Example #6
def crawl_district_list():
    global city_code_list
    city_code_list = query_data(2)

    try:
        page_urls = generate_page_url()
        for k, page_url in enumerate(page_urls):
            print page_url
            response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="countytable").find_all(
                "tr", class_="countytr")
            for k, item in enumerate(info_list):

                if item.contents[0].find('a', {'href': True}):
                    #street_url = common_url_prefix + url_code + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')

                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_city_code(code)
                    level = 3
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)

                    #crawl_street_detail(street_url)

    except Exception, e:
        print traceback.format_exc()
Example #7
def crawl_district_detail(url, url_code):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table',
                              class_="countytable").find_all("tr",
                                                             class_="countytr")

        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            d_code_item = {}
            d_code_item['code'] = code
            d_code_item['name'] = name
            district_code_list.append(d_code_item)

        for k, item in enumerate(info_list):

            if item.contents[0].find('a', {'href': True}):
                street_url = common_url_prefix + url_code + item.contents[
                    0].a.attrs['href'].encode('utf8')
                code = item.contents[0].a.get_text().encode('utf8')
                name = item.contents[1].a.get_text().encode('utf8')
                parent_code, parent_name = get_city_code(code)
                level = 3
                #print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)

                crawl_street_detail(street_url)

    except Exception, e:
        print traceback.format_exc()
Example #8
def crawl_tags():
    response = request_util(book_tags_url)
    try:
        soup = BeautifulSoup(response, "html.parser")
        info_list = soup.find_all('table', class_="tagCol")
        for item in info_list:
            content_list = item.find_all("a")
            for content in content_list:
                page_url_prefix = book_url + content.attrs['href'].encode(
                    'utf8')
                tag = page_url_prefix.split("tag/", 1)[1]
                crawl_book_list(page_url_prefix, tag)

    except Exception, e:
        print traceback.format_exc()
Example #9
def crawl_city_detail(url):

    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table',
                              class_="citytable").find_all("tr",
                                                           class_="citytr")

        for k, item in enumerate(info_list):

            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Example #10
def crawl_city_list():
    response = request_util(list_url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="provincetable").find_all(
            "tr", class_="provincetr")
        for k, item in enumerate(info_list):

            content_list = item.find_all("a")
            for c_k, c_item in enumerate(content_list):
                d_url = common_url_prefix + c_item.attrs['href'].encode('utf8')
                print d_url
                crawl_city_detail(d_url)
                if (c_k > 1):
                    break

            break

    except Exception, e:
        print traceback.format_exc()
Example #11
def crawl_street_detail(url):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table',
                              class_="towntable").find_all("tr",
                                                           class_="towntr")
        for k, item in enumerate(info_list):

            if item.contents[0].find('a', {'href': True}):
                #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                code = item.contents[0].a.get_text().encode('utf8')
                name = item.contents[1].a.get_text().encode('utf8')
                parent_code, parent_name = get_city_code(code)
                level = 4
                print code, name, parent_code, parent_name
                #insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Example #12
def hosts_info_instance(url, timestamp=None, timeout='2m', **kwargs):
    '''GET api/v1/hosts/<instance_name>'''
    # Resolve the timestamp at call time; a time.time() default argument
    # would be frozen when the module is imported.
    if timestamp is None:
        timestamp = time.time()
    results = {}
    node_info = read_info()

    url = url + '/api/v1/query'

    for info in node_info:
        query = info.strip('\n')
        # Fresh dict per query so the entries in results do not all share
        # (and keep appending to) the same dict.
        rlt_values = {}
        param = {
            'query': query + '{' + dic_to_str(kwargs) + '}',
            'timestamp': timestamp,
            'timeout': timeout
        }
        data = request_util(url, param)

        for i in range(len(data)):
            rlt_values.setdefault(data[i]['metric']['instance'], data[i]['value'][1])
        results[query] = rlt_values
        logging.debug('host_info_instance results are : %s', results)
    return results
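
hosts_info_instance builds its PromQL label matcher with a dic_to_str helper that is not listed here. Given how it is spliced into query + '{' + ... + '}', it presumably turns the keyword arguments into a comma-separated list of label='value' pairs; a minimal sketch under that assumption:

# Hypothetical dic_to_str: {'instance': '10.0.0.1:9100', 'job': 'node'}
# -> "instance='10.0.0.1:9100',job='node'" for use inside a PromQL selector.
def dic_to_str(labels):
    return ','.join("%s='%s'" % (key, value) for key, value in labels.items())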
Example #13
def hosts_info(url, timestamp=None, timeout='2m'):
    # Resolve the timestamp at call time; a time.time() default argument
    # would be frozen when the module is imported.
    if timestamp is None:
        timestamp = time.time()
    results = {}
    node_info = read_info()

    url = url + '/api/v1/query'

    for info in node_info:
        query = info.strip('\n')
        # Fresh dict per query so the entries in results do not all share
        # (and keep appending to) the same dict.
        rlt_values = {}
        param = {
            'query': query,
            'timestamp': timestamp,
            'timeout': timeout
        }

        data = request_util(url, param)

        for i in range(len(data)):
            rlt_values.setdefault(data[i]['metric']['instance'],data[i]['value'][1])
        results[query] = rlt_values
        logging.info('results is : %s', results)
    return results
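
The monitoring examples pass a params dict to request_util and index the return value like the result array of the Prometheus /api/v1/query response, so the helper presumably unwraps that envelope. A minimal sketch under that assumption; the error handling and the use of a plain GET are guesses:

# Hypothetical request_util variant for the Prometheus examples: issue the
# query and return data['result'] from the standard query-API envelope.
import requests

def request_util(url, params):
    resp = requests.get(url, params=params, timeout=10)
    body = resp.json()
    if body.get('status') != 'success':
        raise RuntimeError('Prometheus query failed: %s' % body.get('error'))
    return body['data']['result']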
Example #14
def crawl_city_list():
    global province_code_list
    province_code_list = query_data(1)
    response = request_util(city_url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="provincetable").find_all(
            "tr", class_="provincetr")
        for k, item in enumerate(info_list):
            content_list = item.find_all("a")
            for c_k, c_item in enumerate(content_list):
                d_url = c_item.attrs['href'].encode('utf8')
                url_city_code = c_item.attrs['href'].encode('utf8').split(
                    ".")[0]
                d_city_url = common_url_prefix + url_city_code + ".html"
                print d_city_url
                crawl_city_detail(d_city_url)
                # if(c_k > 1):
                #     break

    except Exception, e:
        print traceback.format_exc()
Example #15
def single_host_cpu(url, instance, timestamp=None, timeout='2m'):
    '''
    :param url: The URL we want to query exactly.
    :param instance: a string in the format 'ip:port'
    :param timestamp: Unix timestamp, defaults to time.time()
    :param timeout: query timeout, defaults to 2 minutes
    :return: a dict containing the CPU info of the instance node.
    '''
    # Resolve the timestamp at call time; a time.time() default argument
    # would be frozen when the module is imported.
    if timestamp is None:
        timestamp = time.time()
    url = url + '/api/v1/query'
    param = {
        "query": "node_cpu{{ instance='{0}' }}".format(instance),
        "timestamp": timestamp,
        "timeout": timeout
    }

    data = request_util(url, param)

    logging.info("getting cpu cores...")
    cores = cpu_cores(url)
    logging.info("The Cores of CPU are: %s", cores)
    cpu_results = {
        "instance": instance,
        "cpu": {}
    }
    logging.info("Result of request data is %s", len(data))

    for i in range(cores):
        values = {}
        for index in range(len(data)):
            # Walk the whole query result and group the entries by CPU:
            # for entries belonging to this CPU, collect each mode and its
            # value into the values dict.
            if data[index]['metric']['cpu'] == 'cpu{0}'.format(i):
                values.setdefault(data[index]['metric']['mode'], data[index]['value'][1])
        # Each CPU gets its own values dict.
        cpu_results['cpu']['cpu{0}'.format(i)] = values
    return cpu_results
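
single_host_cpu also relies on a cpu_cores helper that is not part of this listing. One plausible implementation counts the distinct cpu labels returned for node_cpu; the query string and the reuse of the request_util sketch above are assumptions:

# Hypothetical cpu_cores helper: query node_cpu on the (already suffixed)
# query endpoint and count distinct cpu labels.
def cpu_cores(url):
    data = request_util(url, {'query': 'node_cpu'})
    return len(set(item['metric']['cpu'] for item in data))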
Example #16
def crawl_city_detail(url):

    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table',
                              class_="citytable").find_all("tr",
                                                           class_="citytr")

        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            c_code_item = {}
            c_code_item['code'] = code
            c_code_item['name'] = name
            city_code_list.append(c_code_item)

        for k, item in enumerate(info_list):

            # print common_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
            # print item.contents[0].a.get_text().encode('utf8')
            # print item.contents[1].a.get_text().encode('utf8')

            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)

            district_url = common_url_prefix + item.contents[0].a.attrs[
                'href'].encode('utf8')
            #print district_url
            crawl_district_detail(district_url)

    except Exception, e:
        print traceback.format_exc()
Example #17
def single_host_info(url, instance, timestamp=None, timeout='2m'):
    '''
    :param url: The URL we want to query exactly.
    :param instance: a string in the format 'ip:port'
    :param timestamp: Unix timestamp, defaults to time.time()
    :param timeout: query timeout, defaults to 2 minutes
    :return: a dict containing the host metrics of the instance node.
    '''
    # Resolve the timestamp at call time; a time.time() default argument
    # would be frozen when the module is imported.
    if timestamp is None:
        timestamp = time.time()
    url = url + '/api/v1/query'
    data_results = {
        "href": "http://10.110.13.216:9200/api/v1/hosts/{0}?fields=metrics".format(instance),
        "instance": instance,
        "boot": {},
        "cpu": {},
        "filesystem": {},
        "disk": {},
        "load": {},
        "memory": {},
        "network": {},
        "procs": {}
    }
    single_host = read_host_info()
    single_output = ["disk", "load", "memory", "procs"]
    multi_output = ["cpu", "filesystem", "network"]


    for info in single_host:
        logging.info('Reading %s successfully.', info)
        query = info.strip('\n')
        param = {
            'query': query + "{{ instance='{0}' }}".format(instance),
            'timestamp': timestamp,
            'timeout': timeout
        }
        data = request_util(url, param)
        for key in single_output:
            # Metrics listed in single_output are reported as a single flat
            # entry under their key.
            if key in info:
                data_results[key].setdefault(data[0]['metric']['__name__'], data[0]['value'][1])

        if 'node_cpu' in info:
            # Determine how many CPUs there are; the cpu label repeats in the
            # result, so collect it into a set to deduplicate.
            cpu_elements = set()
            for index in range(len(data)):
                cpu_elements.add(data[index]['metric']['cpu'])
            for element in cpu_elements:
                data_results['cpu'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole query result, grouping by CPU: pack each
                    # mode and its value into data_results['cpu'][element].
                    if data[index]['metric']['cpu'] == element:
                        data_results['cpu'][element].setdefault(data[index]['metric']['mode'],
                                                                data[index]['value'][1])

        elif 'node_network' in info:
            # Determine how many network interfaces there are; the device
            # label repeats in the result, so collect it into a set to
            # deduplicate.
            network_elements = set()
            for index in range(len(data)):
                network_elements.add(data[index]['metric']['device'])
            for element in network_elements:
                data_results['network'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole query result, grouping by network device:
                    # pack each metric name and its value into
                    # data_results['network'][element].
                    if data[index]['metric']['device'] == element:
                        data_results['network'][element].setdefault(data[index]['metric']['__name__'],
                                                                    data[index]['value'][1])

        elif 'node_filesystem' in info:
            pprint(data)
            # data_results['filesystem'].setdefault(data[0]['metric']['__name__'], data[0]['value'][1])
            # Determine how many filesystems there are; the device label
            # repeats in the result, so collect it into a set to deduplicate.
            fs_elements = set()
            for index in range(len(data)):
                fs_elements.add(data[index]['metric']['device'])
            for element in fs_elements:
                data_results['filesystem'].setdefault(element, {})
                for index in range(len(data)):
                    # Walk the whole query result, grouping by filesystem
                    # device: pack each metric name and its value into
                    # data_results['filesystem'][element].
                    if data[index]['metric']['device'] == element:
                        data_results['filesystem'][element].setdefault(data[index]['metric']['__name__'],
                                                                       data[index]['value'][1])

        elif 'node_boot' in info:
            data_results['boot'].setdefault(data[0]['metric']['__name__'],
                                            time.time() - float(data[0]['value'][1]))
        else:
            pass
        # logging.info("Result of request data is %s", data)
    return data_results
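
A hypothetical invocation of single_host_info; both addresses below are placeholders rather than values taken from the examples above, so substitute your own Prometheus endpoint and node-exporter instance:

from pprint import pprint

if __name__ == '__main__':
    # Placeholder endpoint and instance, assuming a standard node_exporter setup.
    info = single_host_info('http://prometheus.example:9090', '10.0.0.1:9100')
    pprint(info)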