Example #1
    def store_label_to_db(self, label_collection=None):
        """ 把变量和值标签关联存储到数据库

        :param label_collection:
        :return: 返回self
        """
        if label_collection is None:
            label_collection = MonCollection(
                database=MonDatabase(mongodb=MongoDB(),
                                     database_name='surveydata'),
                collection_name='cgsslabel').collection

        for year in self._stata_label_object:
            stata_label_data = self._stata_label_object[year].read()
            records = dict(
                zip(stata_label_data.loc[:, "name"],
                    stata_label_data.loc[:, "vallab"]))
            records["year"] = year
            records["type"] = "variable value lables"
            print(records)
            label_collection.insert_one(records)

        return self
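A minimal usage sketch, assuming `loader` is an instance of the (unnamed) class that defines store_label_to_db, with its per-year label readers already attached; passing an explicit pymongo collection overrides the default surveydata.cgsslabel target.

from pymongo import MongoClient

# Hypothetical driver: `loader` must come from the surrounding project.
client = MongoClient('localhost', 27017)
labels = client['surveydata']['cgsslabel']  # same target as the default
loader.store_label_to_db(label_collection=labels)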
Example #2
    def store_data_to_db(self, data_collection=None, label_collection=None):
        """ 把stata对象中的数据存入数据库

        :param data_collection:
        :param label_collection:
        :return: 返回self
        """
        if data_collection is None:
            data_collection = MonCollection(
                database=MonDatabase(mongodb=MongoDB(),
                                     database_name='surveydata'),
                collection_name='cgssdata').collection

        if label_collection is None:
            label_collection = MonCollection(
                database=MonDatabase(mongodb=MongoDB(),
                                     database_name='surveydata'),
                collection_name='cgsslabel').collection

        for year in self._stata_object:

            stata_data = self._stata_object[year].read()
            records = stata_data.to_dict("records")
            for record in records:
                record["year"] = year
                print(record)
                data_collection.insert_one(record)

            value_labels = self._stata_object[year].value_labels
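            # BSON document keys must be strings, so coerce the numeric
            # value-label codes to str before inserting into MongoDB.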
            str_value_labels = dict()
            for key in value_labels:
                str_value_labels[key] = {
                    str(inn_key): value_labels[key][inn_key]
                    for inn_key in value_labels[key]
                }
            str_value_labels["year"] = year
            str_value_labels["type"] = "value labels"
            print(str_value_labels)
            label_collection.insert_one(str_value_labels)

            variable_labels = self._stata_object[year].variable_labels
            variable_labels["year"] = year
            variable_labels["type"] = "variable labels"
            print(variable_labels)
            label_collection.insert_one(variable_labels)

        return self
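Both store_* methods return self, so a caller can chain them. A short sketch with a hypothetical `loader` instance, writing to the default surveydata collections:

# Both methods return self, so the two stores chain fluently. `loader` is a
# hypothetical instance of the owning class; the defaults write to
# surveydata.cgssdata and surveydata.cgsslabel.
loader.store_data_to_db().store_label_to_db()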
Example #3
class CollegeInfo:
    def __init__(self):
        mongo = MongoDB(conn_str='localhost:27017')
        self._college_info = MonCollection(
            mongo, database='webdata',
            collection_name='college_info').collection
        self._college_intro = MonCollection(
            mongo, database='webdata',
            collection_name='college_introduction').collection

        self._headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    def init_first_stage(self):
        web_fmt = "http://college.gaokao.com/schlist/a{}/p{}"

        for i in range(1, 32):
            url = web_fmt.format(str(i), '1')
            raw_result = requests.get(url, headers=self._headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
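            # The pager widget ('#qx') renders text like '1/12页' ("page 1 of
            # 12"); split on '/' and '页' to recover the total page count.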
            for string in bs_obj.select('#qx')[0].strings:
                total_pages = re.split('页', re.split('/', string)[1])[0]
                break

            for j in range(1, int(total_pages) + 1):
                surf_url = web_fmt.format(str(i), str(j))
                print(surf_url)
                surf_result = requests.get(surf_url,
                                           headers=self._headers).text
                surf_obj = BeautifulSoup(surf_result, "lxml")
                surf_content = surf_obj.select('.scores_List')[0]

                colleges = [
                    item.attrs['title']
                    for item in surf_content.select('.blue')
                ]

                college_info = []
                for ul_item in surf_content.select('ul'):
                    one_college_info = dict()
                    for n, li in enumerate(ul_item.select('li')):
                        if n == 1:
                            # The second <li> encodes 985/211 status through
                            # the number and content of its child nodes.
                            college_type = li.contents
                            if len(college_type) == 1:
                                one_college_info['985'] = False
                                one_college_info['211'] = False
                            elif len(college_type) == 2:
                                if college_type[1].string == '211':
                                    one_college_info['985'] = False
                                    one_college_info['211'] = True
                                elif college_type[1].string == '985':
                                    one_college_info['985'] = True
                                    one_college_info['211'] = False
                                else:
                                    raise Exception(
                                        'unexpected college type markup')
                            else:
                                one_college_info['985'] = True
                                one_college_info['211'] = True
                        else:
                            # The other <li> items are 'key:value' pairs; the
                            # site marks missing values with dashes.
                            key, value = re.split(':', li.string)
                            if value in ('——', '------'):
                                value = None
                            one_college_info[key] = value
                    college_info.append(one_college_info)

                # Attach the college name under the key '学校' ("school"),
                # matching the Chinese field names used on the site.
                for name, info in zip(colleges, college_info):
                    info['学校'] = name

                for college in college_info:
                    found = self._college_info.find_one(college)
                    if found is None:
                        print('Insert..', college)
                        self._college_info.insert_one(college)
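A minimal driver sketch for the class above, assuming a local mongod on the default port and that college.gaokao.com still serves these pages:

# Hypothetical driver. Re-running is safe because init_first_stage checks
# find_one before every insert, so existing records are not duplicated.
if __name__ == '__main__':
    crawler = CollegeInfo()
    crawler.init_first_stage()  # fills webdata.college_info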
Example #4
class GaoKaoWebScraper:
    def __init__(self):
        mongo = MongoDB(conn_str='localhost:27017')
        self._web_conn = MonCollection(mongo,
                                       database='cache',
                                       collection_name='gaokaoweb').collection
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection

    def init_first_stage(self):
        web_fmt = "http://college.gaokao.com/schpoint/{}/{}/{}/"
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

        for i in range(1, 32):
            for j in range(1, 32):
                url = web_fmt.format(''.join(['a', str(i)]),
                                     ''.join(['b', str(j)]), 'p1')
                raw_result = requests.get(url, headers=headers).text
                bs_obj = BeautifulSoup(raw_result, "lxml")
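                # Same pager parsing as in CollegeInfo: the '#qx' text reads
                # like '1/12页', so take the count between '/' and '页'.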
                for string in bs_obj.select('#qx')[0].strings:
                    total_pages = re.split('页', re.split('/', string)[1])[0]
                    break

                if len(total_pages) > 0:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(''.join(['a', str(i)]),
                                             ''.join(['b', str(j)]),
                                             ''.join(['p', str(m)]))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        for search_record in self._web_conn.find({'type': 'search'}):
            raw_result = requests.get(search_record['url'],
                                      headers=headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            for obj in bs_obj.select('.blue'):
                found = obj.find_all(href=re.compile("result"))
                if len(found) > 0:
                    url = found[0]['href']
                    record = {'type': 'data', 'url': url}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
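        # Deduplicate the collected result URLs: Cursor.distinct('url')
        # returns each URL once, seeding the per-university work queue.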
        university_urls = self._data_web_conn.find().distinct('url')
        for url in university_urls:
            self._university_web_conn.insert_one({'url': url})

    def scrape(self, using_proxy=False):
        fields = ['年份', '最低', '最高', '平均', '录取人数', '录取批次']
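        # Column keys, kept in Chinese to match the source site: 年份=year,
        # 最低=lowest score, 最高=highest score, 平均=average score,
        # 录取人数=number admitted, 录取批次=admission batch.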
        nums = self._copy_data_web_conn.count_documents({})
        while nums > 0:
            urls = [
                item['url'] for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()

            for html in scraper.result:
                # each result is assumed to be a (page_html, source_url) pair
                url = html[1]
                bs_obj = BeautifulSoup(html[0], "lxml")
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))

                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                if len(table) > 0:
                    for item in table:
                        copy_record = copy.copy(record)
                        if len(item) == 0:
                            continue
                        if len(item) == 6:
                            for i in range(6):
                                if item[i] == '------':
                                    # dashes mark missing values
                                    copy_record[fields[i]] = None
                                elif i < 5:
                                    # numeric columns: year, the three score
                                    # statistics and the admission count
                                    copy_record[fields[i]] = int(float(item[i]))
                                else:
                                    # the admission batch stays a string
                                    copy_record[fields[i]] = item[i]
                        else:
                            raise Exception(
                                'unexpected row width: {}'.format(len(item)))

                        found = self._data_conn.find_one(copy_record)
                        if found is None:
                            print('Insert..', copy_record)
                            self._data_conn.insert_one(copy_record)
                self._copy_data_web_conn.delete_one({'url': url})

            print('Total: {}'.format(time.time() - start))
            nums = self._copy_data_web_conn.count_documents({})
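The four stages form a pipeline: init_first_stage enumerates the search-result pages, init_second_stage extracts per-university result URLs, init_three_stage de-duplicates them, and scrape drains a working copy of that URL list, deleting each URL once its rows are stored so an interrupted run can simply be restarted. A minimal driver sketch, assuming the project's MongoDB, MonCollection, StaticWebScraper and HtmlParser helpers are importable and mongod is running:

# Hypothetical end-to-end driver. Note that scrape() reads
# webdata.gaokaouniversityweb, a working copy of the URL list built by
# init_three_stage (the copy step itself is not shown in this snippet).
if __name__ == '__main__':
    scraper = GaoKaoWebScraper()
    scraper.init_first_stage()   # search pages -> cache.gaokaoweb
    scraper.init_second_stage()  # result URLs -> cache.gaokaodataweb
    scraper.init_three_stage()   # distinct URLs -> cache.gaokaouniversityweb
    scraper.scrape()             # drain the working copy; restart-safe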