Example #1
import json
from functools import reduce
from urllib import parse

import numpy as np
import pandas as pd
import requests

# Crawler is an assumed project-local module that handles the indices not
# served by the easyquery API; the import path below is a guess.
from crawler import Crawler

class WebScraper:
    def __init__(self, start_month="2017/01"):

        # fallback crawler for indices published on other pages
        self._crawler = Crawler(start_month=start_month)

        # National Bureau of Statistics easyquery endpoint
        self._url = "http://data.stats.gov.cn/easyquery.htm?"
        self._sj_start = start_month.replace("/", "")
        self._index_dict = {
            "制造业采购经理指数(%)": "A0B0101",
            "非制造业商务活动指数(%)": "A0B0201",
            "工业生产者出厂价格指数(上年同月=100)": "A01080101",
            "工业增加值累计增长(%)": "A020102",
            "房地产投资累计值(亿元)": "A060101",
            "流通中现金(M0)供应量期末值(亿元)": "A0D0105",
            "货币(M1)供应量期末值(亿元)": "A0D0103",
            "货币和准货币(M2)供应量期末值(亿元)": "A0D0101",
            "GDP": "A010201",

            # "生产指数": "A0B0102",
            # "新订单指数": "A0B0103",
            # "新出口订单指数": "A0B0104",
            # "在手订单指数": "A0B0105",
            # "产成品库存指数": "A0B0106",
            # "采购量指数": "A0B0107",
            # "进口指数": "A0B0108",
            # "出厂价格指数": "A0B0109",
            # "主要原材料购进价格指数": "A0B010A",
            # "原材料库存指数": "A0B010B",
            # "从业人员指数": "A0B010C",
            # "供应商配送时间指数": "A0B010D",
            # "生产经营活动预期指数": "A0B010E",
            #
            # "新订单指数": "A0B0202",
            # "新出口订单指数": "A0B0203",
            # "在手订单指数": "A0B0204",
            # "存货指数": "A0B0205",
            # "投入品价格指数": "A0B0206",
            # "销售价格指数": "A0B0207",
            # "从业人员指数": "A0B0208",
            # "供应商配送时间指数": "A0B0209",
            # "业务活动预期指数": "A0B020A",
            #
            # "综合PMI产出指数": "A0B0301"
        }

        self._data_by_quarter = ["GDP"]

    # ----------- private methods ----------- #

    def _extract_json(self, jd):
        """
        Parse the JSON payload returned by the easyquery API.
        :param jd: loaded json data
        :return: two parallel lists (index data & months, as "YYYY/MM")
        """
        # quarter code (A-D) -> the three months of that quarter
        d = {
            "A": ["01", "02", "03"],
            "B": ["04", "05", "06"],
            "C": ["07", "08", "09"],
            "D": ["10", "11", "12"]
        }
        datanode_list = jd["returndata"]["datanodes"]
        data_list, month_list = [], []

        for node in datanode_list:

            hasdata = node["data"]["hasdata"]
            data = node["data"]["data"] if hasdata else np.nan
            time = node["wds"][1]["valuecode"]

            # monthly data: valuecode ends in a digit (e.g. 201701)
            if time[-1].isdigit():
                year, month = time[:4], time[4:]
                time = year + "/" + month
                data_list.append(data)
                month_list.append(time)

            # quarterly data: repeat the value across the quarter's three months
            else:
                year, quarter = time[:4], time[4:]
                months = d[quarter]  # list of months
                data_list.extend([data] * len(months))
                time = [year + "/" + month for month in months]
                month_list.extend(time)

        return data_list, month_list
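
    # For reference, _extract_json assumes each datanode has roughly the shape
    # below (inferred from the parsing code above; values are illustrative):
    #
    #   {
    #       "data": {"hasdata": True, "data": 51.3},
    #       "wds": [
    #           {"wdcode": "zb", "valuecode": "A0B0101"},
    #           {"wdcode": "sj", "valuecode": "201701"},  # "2017A" for a quarter
    #       ],
    #   }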

    def _get_url(self, key, sj_start):
        """
        Build the request URL; one query can contain only a single index.
        :param key: target index name (a key of self._index_dict)
        :param sj_start: first month of the data record (e.g. 201701)
        :return: url-encoded url for the http request
        """
        zb = self._index_dict[key]
        dbcode = "hgjd" if key in self._data_by_quarter else "hgyd"  # quarterly vs. monthly

        query = {
            "m": "QueryData",
            "dbcode": dbcode,
            "rowcode": "zb",
            "colcode": "sj",
            "wds": [],
            "dfwds": [
                {"wdcode": "zb", "valuecode": zb},
                {"wdcode": "sj", "valuecode": sj_start + "-"},
            ],
        }
        query_string = parse.urlencode(query, quote_via=parse.quote)
        url = self._url + query_string
        url = url.replace("%27", "%22")  # replace '' by "" in encoded string

        return url
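
    # Illustrative only: for key="GDP" (zb="A010201", dbcode="hgjd") and
    # sj_start="201701", the resulting URL looks roughly like
    #   http://data.stats.gov.cn/easyquery.htm?m=QueryData&dbcode=hgjd
    #   &rowcode=zb&colcode=sj&wds=%5B%5D&dfwds=<percent-encoded JSON:
    #   [{"wdcode":"zb","valuecode":"A010201"},{"wdcode":"sj","valuecode":"201701-"}]>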

    def _get_json_data(self, url):
        """Fetch the url and decode the JSON response."""
        web_data = requests.get(url, timeout=30).text
        return json.loads(web_data)

    def _scrape(self, key):
        # "企业商品价格指数" (corporate goods price index) is not served by the
        # easyquery API, so it is delegated to the fallback crawler
        if key in ["企业商品价格指数"]:
            return self._crawler.crawl_index(key)
        else:
            url = self._get_url(key, self._sj_start)
            json_data = self._get_json_data(url)
            data_list, month_list = self._extract_json(json_data)
            data_dict = {"Month": month_list, key: data_list}
            return pd.DataFrame(data=data_dict)

    # ----------- callable methods ----------- #

    def scrape(self, index_key):
        """Scrape a single index (str) or several indices (list of str)."""
        if isinstance(index_key, str):
            return self._scrape(index_key)

        elif isinstance(index_key, list):
            # scrape each index separately, then outer-join the frames on Month
            dfs = [self._scrape(key) for key in index_key]
            return reduce(
                lambda df1, df2: pd.merge(df1, df2, on="Month", how="outer"),
                dfs)

        else:
            raise ValueError("Unsupported argument type, use str or list")
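

# A minimal usage sketch, assuming network access and that the Crawler
# dependency is importable; index names must match the _index_dict keys:
if __name__ == "__main__":
    scraper = WebScraper(start_month="2017/01")

    # a single index returns one DataFrame with columns ["Month", <index name>]
    pmi = scraper.scrape("制造业采购经理指数(%)")

    # a list of indices returns the per-index frames outer-joined on "Month"
    macro = scraper.scrape(["GDP", "制造业采购经理指数(%)"])
    print(macro.head())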