Ejemplo n.º 1
0
def formatcity_two():
    """Run ``standard_city`` over ``city_list_two`` and print the distinct hits.

    For every entry the value returned by ``standard_city`` is printed.  It is
    kept when its first element is exactly ``True``; otherwise ``'wrong'`` is
    printed.  At the end the set of kept results and its size are printed.
    """
    city_list = []
    for raw_city in city_list_two:
        # Look the entry up once and reuse the result instead of repeating the
        # same call for the print, the check and the append.
        outcome = standard_city(raw_city)
        print(outcome)
        if outcome[0] is True:
            city_list.append(outcome)
        else:
            print('wrong')
    unique_results = set(city_list)
    print(unique_results)
    print(len(unique_results))
Ejemplo n.º 2
0
def update_lagou_fields():
    """Recompute ``fj_city`` / ``fj_region`` for documents whose source is '拉钩'.

    For each company a lookup string is built from its ``city``, ``region``
    and ``address`` values (``None`` parts are left out), passed to
    ``standard_city`` and, when that succeeds, to ``standard_region``.  The
    document is then written back with ``update_one``, matched on
    ``company_id`` and ``company_source``.
    """
    companys = collection.find({'company_source': '拉钩'}, no_cursor_timeout=True)
    try:
        # The slice skips the first 220000 matching documents.
        for company in companys[220000:]:
            address = company['address']
            city = company['city']
            region = company['region']
            if city is not None:
                # city first, then whichever of region / address is present
                address_string = city
                if region is not None:
                    address_string = address_string + region
                if address is not None:
                    address_string = address_string + address
            elif address is not None:
                # without a city the address leads and the region follows it
                address_string = address if region is None else address + region
            else:
                # neither city nor address: a lone region is not used
                address_string = ''
            result, real_city = standard_city(address_string)
            if result:
                company['fj_city'] = real_city
                r, real_region = standard_region(real_city, address_string)
                company['fj_region'] = real_region if r else None
            else:
                company['fj_city'] = None
                company['fj_region'] = None
            collection.update_one(
                {'company_id': company['company_id'],
                 'company_source': company['company_source']},
                {'$set': company})
            print('{}已经更新了'.format(company['company_id']))
    finally:
        # A cursor opened with no_cursor_timeout=True has to be closed
        # explicitly, including when the loop is interrupted by an error.
        companys.close()
Ejemplo n.º 3
0
 def insert_db(self):
     """Standardise the record's city/region and insert it into ``collection``.

     The record comes from ``self.serialization_info()`` and is stamped with
     ``crawler_time``.  ``fj_city`` and ``fj_region`` are always written:
     they hold the standardised values when the lookups succeed and ``None``
     otherwise, including when ``standard_region`` raises
     ``StandarCityError``.  A ``DuplicateKeyError`` on insert is logged and
     not re-raised.
     """
     data = self.serialization_info()
     data['crawler_time'] = datetime.datetime.now()
     if data['city'] and data['region']:
         standard_string = data['city'] + data['region']
     else:
         standard_string = None

     # Standardise the city first; the region lookup needs its result.
     result, real_city = standard_city(data['city'])
     data['fj_city'] = real_city if result else None
     # Default to None so the key exists even if the region lookup fails.
     data['fj_region'] = None
     if result:
         try:
             r, real_region = standard_region(real_city, standard_string)
         except StandarCityError as e:
             log.error(e)
         else:
             if r:
                 data['fj_region'] = real_region

     # NOTE(review): the original comment here read "code used when creating
     # the compound index"; presumably a unique index on company_source and
     # company_id is what raises DuplicateKeyError - confirm.
     try:
         collection.insert_one(data)
     except DuplicateKeyError:
         log.error('该数据已经存在,company_source={},company_id={}'.format(
             data['company_source'], data['company_id']))
     else:
         log.info('插入数据={}'.format(data))
Ejemplo n.º 4
0
    def insert_db(self):
        """Standardise city and region, then insert the record if it is new.

        The record comes from ``serialization_info(self)`` and is passed to
        ``compare``.  Its ``city`` and ``region`` are replaced by the values
        returned from ``standard_city`` / ``standard_block``.  If either call
        reports ``False`` the record is only logged as an error.  Otherwise
        it is inserted unless ``coll`` already holds a document with the same
        city, region, district_name, source, trade_date and area.
        """
        record = serialization_info(self)
        compare(record)

        city_ok, standard_name = standard_city(record['city'])
        record['city'] = standard_name
        region_ok, standard_region_name = standard_block(
            record['city'], record['region'])
        record['region'] = standard_region_name

        # TODO: insert check

        if city_ok is False or region_ok is False:
            log.error('城市区域数据格式化失败data={}'.format(record))
            return

        identity_keys = ('city', 'region', 'district_name', 'source',
                         'trade_date', 'area')
        duplicate_filter = {key: record[key] for key in identity_keys}
        if coll.find_one(duplicate_filter):
            log.info('已经存在数据={}'.format(record))
        else:
            coll.insert_one(record)
            log.info('插入数据={}'.format(record))
def delete_region():
    """Fill in missing regions from ``collection_offline`` or drop the record.

    Iterates over the documents in ``collection`` whose ``region`` is
    ``None``.  When ``standard_city`` accepts the city, the district is
    looked up in ``collection_offline`` by name and then by alias, and each
    hit writes its ``region`` back to the document.  When the city is
    rejected the document is deleted.  The number of deleted documents is
    printed at the end.
    """
    count = 0
    cursor = collection.find({'region': None}, no_cursor_timeout=True)
    try:
        for i in cursor:
            city = i['city']
            name = i['district_name']
            result, city_fj = standard_city(city)
            if not result:
                print('匹配不到')
                # delete_one replaces the deprecated Collection.remove; the
                # filter is on _id, so at most one document matches.
                collection.delete_one({'_id': i['_id']})
                count = count + 1
                continue

            d = collection_offline.find_one({'city': city_fj, 'name': name})
            if d:
                print('库里小区名={}, 成交小区名={}'.format(d['name'], name))
                collection.update_one({'_id': i['_id']},
                                      {'$set': {'region': d['region']}})
            # The alias lookup runs even after a name hit, so an alias match
            # overwrites the region written just above.
            a = collection_offline.find_one({'city': city_fj, 'alias': name})
            if a:
                print('找到成交别名了,库里小区名={}, 成交小区名={}'.format(a['name'], name))
                collection.update_one({'_id': i['_id']},
                                      {'$set': {'region': a['region']}})
    finally:
        # Cursors opened with no_cursor_timeout=True must be closed by hand.
        cursor.close()
    print('delete count={}'.format(count))
Ejemplo n.º 6
0
def mongo_chanch():
    """Copy community prices from ``coll_name`` into ``save_coll``.

    Each source document is turned into a ``'district'`` category record
    with the results of ``standard_city`` / ``standard_block``, a ``YYYYMM``
    integer ``s_date`` taken from its ``time`` value, and its price as an
    int.  Records whose standardised region is falsy are skipped.  The rest
    are printed and upserted into ``save_coll``, matched on region, city and
    name.
    """
    cursor = coll_name.find({}, no_cursor_timeout=True)
    try:
        for i in cursor:
            name = i['comm_name']
            city_name_ = i['city']
            DistrictName_ = i['comm_addr']
            UnitPrice = int(i['price'])
            update_time = i['time']
            s_date = int(update_time.strftime('%Y%m'))
            city_name = standard_city(city_name_)
            DistrictName = standard_block(DistrictName_)
            if not DistrictName:
                # no usable region: nothing to store for this record
                continue
            data = {
                'category': 'district',
                'city': city_name,
                'name': name,
                'region': DistrictName,
                's_date': s_date,
                'zhugefang_esf_price': UnitPrice,
            }
            print(data)
            save_coll.update_one(
                {
                    'region': DistrictName,
                    'city': city_name,
                    'name': name
                }, {'$set': data},
                upsert=True)
    finally:
        # Cursors opened with no_cursor_timeout=True must be closed by hand.
        cursor.close()
Ejemplo n.º 7
0
 async def standar_address(self, company):
     """Attach ``fj_city`` / ``fj_region`` to ``company`` and return it.

     A lookup string is assembled from the company's ``city``, ``region``
     and ``address`` values (``None`` parts are left out) and handed to
     ``standard_city`` and then ``standard_region``.  A lookup that reports
     failure stores ``None`` for its key.
     """
     address, city, region = (
         company['address'], company['city'], company['region'])

     if city is not None:
         # city leads, followed by whichever of region / address exists
         address_string = city
         if region is not None:
             address_string = address_string + region
         if address is not None:
             address_string = address_string + address
     elif address is not None:
         # no city: the address leads and the region, if any, follows
         address_string = address if region is None else address + region
     else:
         # neither city nor address: a lone region is not used
         address_string = ''

     city_found, real_city = standard_city(address_string)
     if not city_found:
         company['fj_city'] = None
         company['fj_region'] = None
         return company

     company['fj_city'] = real_city
     region_found, real_region = standard_region(real_city, address_string)
     company['fj_region'] = real_region if region_found else None
     return company
Ejemplo n.º 8
0
def mongo_chanch():
    """Copy residential-area prices from ``coll_price`` into ``save_coll``.

    For each price document the community name is read from the matching
    ``coll_name`` document, city and district go through ``standard_city``
    / ``standard_block``, and the resulting ``'district'`` record is printed
    and upserted into ``save_coll``, matched on region, city and name.  A
    record that fails at any step is logged together with the error and
    skipped.
    """
    cursor = coll_price.find({}, no_cursor_timeout=True)
    try:
        for i in cursor:
            try:
                ResidentialAreaID = i['ResidentialAreaID']
                city_name_ = i['city_name']
                DistrictName_ = i['DistrictName']
                UnitPrice = i['UnitPrice']
                update_time = i['update_time']
                # A missing name document yields None here; the subscript
                # below then raises and the record is skipped by the except.
                name_doc = coll_name.find_one({
                    'ResidentialAreaID': ResidentialAreaID,
                    'city_name': city_name_,
                    'DistrictName': DistrictName_,
                })
                name = name_doc['baseinfo']['json'][0][
                    'residentialareaMap']['residentialareaName']
                s_date = int(update_time.strftime('%Y%m'))
                city_name = standard_city(city_name_)
                DistrictName = standard_block(DistrictName_)
                data = {
                    'category': 'district',
                    'city': city_name,
                    'name': name,
                    'region': DistrictName,
                    's_date': s_date,
                    'fanggugu_esf_price': UnitPrice,
                }
                print(data)
                save_coll.update_one(
                    {'region': DistrictName, 'city': city_name, 'name': name},
                    {'$set': data},
                    upsert=True)
            except Exception as e:
                log.info(i)
                # Also record why the document was skipped.
                log.info('mongo_chanch skipped the record above: {!r}'.format(e))
    finally:
        # Cursors opened with no_cursor_timeout=True must be closed by hand.
        cursor.close()
Ejemplo n.º 9
0
def analyzer():
    """Store captured response bodies under ``didi`` on matching documents.

    Fetches the latest log entries, keeps those whose URL contains ``host``,
    takes the URL-decoded ``query`` parameter as the name, downloads the
    entry's body and parses its ``resBody`` as JSON.  When ``standard_city``
    accepts the body's ``city``, the parsed body is written to the ``didi``
    field of the document matching that city and name.  Entries without a
    ``query=...&`` parameter are skipped, and any failure while handling a
    body prints ``'-'``.
    """
    res = requests.get('http://114.80.150.196:8002/latestLog?')
    info_json = res.json()
    for i in info_json:
        url = i['url']
        house_id = i['id']
        if host not in url:
            continue
        query_match = re.search(r'query=(.*?)&', url, re.S | re.M)
        if query_match is None:
            # Without a name there is nothing to key the update on.
            continue
        name = urllib.parse.unquote(query_match.group(1))
        r = requests.get(
            'http://114.80.150.196:8002/fetchBody?id={}'.format(house_id))
        try:
            r_info = r.json()['resBody']
            j = json.loads(r_info)
            result, city = standard_city(j['city'])
            if result:
                # update_one is the non-deprecated equivalent of update()
                # with its default multi=False.
                collection.update_one({
                    'city': city,
                    'name': name
                }, {'$set': {
                    'didi': j
                }})
                print(city, name)
        except Exception:
            print('-')
Ejemplo n.º 10
0
 async def standar_address(self, company):
     """Attach ``fj_city`` / ``fj_region`` to ``company`` and return it.

     The company's ``address`` is passed to ``standard_city`` and, when that
     succeeds, to ``standard_region`` together with the resolved city.  A
     lookup that reports failure stores ``None`` for its key.
     """
     address = company['address']
     city_ok, real_city = standard_city(address)
     company['fj_city'] = real_city if city_ok else None
     if city_ok:
         region_ok, real_region = standard_region(real_city, address)
         company['fj_region'] = real_region if region_ok else None
     else:
         company['fj_region'] = None
     return company
Ejemplo n.º 11
0
def start():
    """Normalise every ``coll_zhugefang`` document and copy it to ``coll_save``.

    The price is stripped (a price equal to 0 becomes ``'0'`` first), the
    city is replaced by ``standard_city``'s result and ``comm_addr`` by the
    stripped result of ``standard_block``.  The document is printed, and
    inserted only when the resulting ``comm_addr`` is non-empty.  A document
    that raises at any step is logged and skipped.
    """
    for record in coll_zhugefang.find():
        try:
            price = record['price']
            if price == 0:
                price = record['price'] = '0'
            record['price'] = price.strip()
            record['city'] = standard_city(record['city'])
            block = standard_block(record['comm_addr'])
            record['comm_addr'] = block.strip()
            print(record)
            if record['comm_addr']:
                coll_save.insert_one(record)
        except Exception:
            log.info(record)
Ejemplo n.º 12
0
def update_51job_fields():
    """Recompute ``fj_city`` / ``fj_region`` for documents whose source is '51job'.

    Each company's ``address`` is passed to ``standard_city`` and, when that
    succeeds, to ``standard_region``.  A lookup that reports failure stores
    ``None`` for its key.  The document is then written back with
    ``update_one``, matched on ``company_id`` and ``company_source``.
    """
    companys = collection.find({'company_source': '51job'}, no_cursor_timeout=True)
    try:
        # The slice skips the first 3376200 matching documents.
        for company in companys[3376200:]:
            address = company['address']
            result, real_city = standard_city(address)
            if result:
                company['fj_city'] = real_city
                r, real_region = standard_region(real_city, address)
                company['fj_region'] = real_region if r else None
            else:
                company['fj_city'] = None
                company['fj_region'] = None
            collection.update_one(
                {'company_id': company['company_id'],
                 'company_source': company['company_source']},
                {'$set': company})
            print('{}已经更新了'.format(company['company_id']))
    finally:
        # A cursor opened with no_cursor_timeout=True has to be closed
        # explicitly, including when the loop is interrupted by an error.
        companys.close()
Ejemplo n.º 13
0
def analyse_city(proxies):
    """Split the links returned by ``send_url`` by whether their city is known.

    For every link a one-entry ``{link text: href}`` dict is built.  It goes
    into the first list when ``standard_city`` accepts the link text and
    into the second list otherwise.  The length and content of both lists
    are printed; nothing is returned.
    """
    recognised = []
    unrecognised = []
    for anchor in send_url(proxies=proxies):
        href = anchor.xpath('@href')[0]
        label = anchor.xpath('text()')[0]
        is_known, _ = standard_city(label)
        bucket = recognised if is_known else unrecognised
        bucket.append({label: href})

    for bucket in (recognised, unrecognised):
        print(len(bucket))
        print(bucket)
Ejemplo n.º 14
0
# Fields scraped from a Baidu Baike city page, in the order they are stored.
# Each entry is (field name, patterns tried in order, capture group that
# holds the value).
_BAIKE_FIELDS = (
    # Chinese name
    ('chinese_name', (r'中文名称</dt>(.*?)<dd(.*?)>(.*?)</dd>',), 3),
    # foreign name
    ('foreign_names', (r'外文名称</dt>(.*?)<dd(.*?)>(.*?)</dd>',), 3),
    # alias
    ('alias',
     (r'别&nbsp;&nbsp;&nbsp;&nbsp;名</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # administrative division category
    ('administrative_division', (r'行政区类别</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # area it belongs to
    ('affiliating_area', (r'所属地区</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # areas under its jurisdiction
    ('governs_area', (r'下辖地区</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # seat of government
    ('government_resident', (r'政府驻地</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # telephone area code
    ('area_code', (r'电话区号</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # postal code
    ('zip_code', (r'邮政区码</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # geographic position
    ('geographic_position', (r'地理位置</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # area
    ('area',
     (r'面&nbsp;&nbsp;&nbsp;&nbsp;积</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # population (value ends at the first following tag)
    ('population',
     (r'人&nbsp;&nbsp;&nbsp;&nbsp;口</dt>.*?<dd.*?>(.*?)<',), 1),
    # famous scenery
    ('famous_scenery', (r'著名景点</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # dialect
    ('localism',
     (r'方&nbsp;&nbsp;&nbsp;&nbsp;言</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # climate
    ('weather_conditions', (r'气候条件</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # airport
    ('airport',
     (r'机&nbsp;&nbsp;&nbsp;&nbsp;场</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # railway station
    ('railway_station', (r'火车站</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # licence plate code
    ('license_code', (r'车牌代码</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # gross regional product
    ('GRP', (r'地区生产总值</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # per-capita gross product
    ('GNPP', (r'人均生产总值</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # per-capita disposable income
    ('per_capita_income', (r'人均支配收入</dt>.*?<dd.*?>(.*?)</dd',), 1),
    # retail sales of consumer goods (value ends at the first <sup)
    ('retail_sales_of_consumer_goods',
     (r'消费品零售额</dt>.*?<dd.*?>(.*?)<sup',), 1),
    # total household deposits (value ends at the first <sup)
    ('total_household_deposits',
     (r'住户存款总额</dt>.*?<dd.*?>(.*?)<sup',), 1),
    # city tree and flower
    ('were_flower', (r'市树市花</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # universities: the page uses several different labels, tried in order
    ('famous_universities', (
        r'著名高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'学&nbsp;&nbsp;&nbsp;&nbsp;校</dt>.*?<dd.*?>(.*?)</dd>',
        r'高等院校</dt>.*?<dd.*?>(.*?)</dd>',
        r'重点高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'高等学府</dt>.*?<dd.*?>(.*?)</dd>',
        r'著名学府</dt>.*?<dd.*?>(.*?)</dd>',
        r'高&nbsp;&nbsp;&nbsp;&nbsp;校</dt>.*?<dd.*?>(.*?)</dd>',
        r'主要高校</dt>.*?<dd.*?>(.*?)</dd>',
        r'大&nbsp;&nbsp;&nbsp;&nbsp;学</dt>.*?<dd.*?>(.*?)</dd>',
        r'知名高校</dt>.*?<dd.*?>(.*?)</dd>',
    ), 1),
    # mayor
    ('mayor',
     (r'市&nbsp;&nbsp;&nbsp;&nbsp;长</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # administrative code
    ('administrative_code', (r'行政代码</dt.*?<dd.*?>(.*?)</dd>',), 1),
    # city spirit
    ('city_spirit', (r'城市精神</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # human development index
    ('human_development_index', (r'人类发展指数</dt>.*?<dd.*?>(.*?)</dd>',), 1),
    # city abbreviation
    ('city_abbreviation', (r'城市简称</dt>.*?<dd.*?>(.*?)</dd>',), 1),
)


def _baike_field(html, patterns, group):
    """Return the tag-stripped text of the first pattern that matches, else None."""
    for pattern in patterns:
        found = re.search(pattern, html, re.S | re.M)
        if found:
            return re.sub('<[^>]+>', '', found.group(group).strip()).strip()
    return None


def _clean_baike_text(text):
    """Remove newlines, ``&nbsp;`` entities and bracketed numbers such as ``[1]``."""
    text = text.replace('\n', '').replace('&nbsp;', '')
    return re.sub(r'\[\d+\]', '', text)


def crawler_baike():
    """Scrape the Baidu Baike page of every city in ``city_list`` into ``coll``.

    For each city the page is downloaded and decoded as UTF-8 (undecodable
    bytes ignored).  Every field in ``_BAIKE_FIELDS`` is extracted with its
    regular expression; a field that is not found is stored as ``None``.
    The city name is replaced by the value returned from ``standard_city``,
    which is also printed when the lookup reports failure.  All string
    values are cleaned, the record is printed and then upserted into
    ``coll``, keyed on the city.
    """
    for city in city_list:
        print(city)
        url = 'https://baike.baidu.com/item/' + urllib.parse.quote(city)
        res = requests.get(url=url, headers=headers)
        html = res.content.decode('UTF-8', 'ignore')

        data = {
            field: _baike_field(html, patterns, group)
            for field, patterns, group in _BAIKE_FIELDS
        }

        is_true, city = standard_city(city)
        if not is_true:
            print(city)
        data['city'] = city
        data['update_time'] = datetime.now()

        # Only text is cleaned; None (field not found) and the timestamp are
        # stored as they are.
        data = {
            key: _clean_baike_text(value) if isinstance(value, str) else value
            for key, value in data.items()
        }
        print(data)
        # The filter uses the standardised city as returned, not the cleaned
        # copy stored in the record.
        coll.update_one({'city': city}, {'$set': data}, upsert=True)