Example #1
0
    def process_item(self, item, spider):
        """
        Persist a crawled item into the ``raw_ly`` database.

        The action is selected by ``item.name``:

        * ``'ticket'``: upsert the ticket (matched on ``pid``) into the
          ``Ticket`` collection.
        * ``'vs-ticket-info'``: set ``ticketInfo`` and ``crawl`` on the
          ``ViewSpot`` document matched on ``lyId`` (no upsert).
        * ``'ticket-delete'`` and any other name: nothing is written.

        :param item: the item to store
        :param spider: the spider that produced the item (unused here)
        :return: the item, unchanged
        """
        kind = item.name

        if kind == 'ticket':
            con_ticket = get_mongodb('raw_ly', 'Ticket', profile='mongo-raw')
            con_ticket.update(
                {'pid': item['pid']},
                {'$set': {
                    'pid': item['pid'],
                    'lyId': item['lyId'],
                    'info': item['info'],
                    'stockList': item['stock_list']
                }},
                upsert=True)
        elif kind == 'ticket-delete':
            # Removal of stale tickets is currently disabled:
            # con_ticket.remove({'lyId': item['ly_id'], 'pid': {'$nin': item['id_list']}}, multi=True)
            pass
        elif kind == 'vs-ticket-info':
            con_vs = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
            con_vs.update(
                {'lyId': item['ly_id']},
                {'$set': {
                    'ticketInfo': item['info'],
                    'crawl': True
                }})

        return item
Example #2
0
    def populate_tasks(self):
        """
        Queue one merge task per document of the selected Baidu collection.

        ``self.args.type == 'mdd'`` selects ``BaiduLocality`` as the source and
        ``geo.LocalityTransfer`` as the target; any other value selects
        ``BaiduPoi`` and ``poi.ViewSpotTransfer``. ``self.args.skip`` and
        ``self.args.limit`` restrict the source cursor.
        """
        if self.args.type == 'mdd':
            col_src, db_tar, col_tar = 'BaiduLocality', 'geo', 'LocalityTransfer'
        else:
            col_src, db_tar, col_tar = 'BaiduPoi', 'poi', 'ViewSpotTransfer'

        source_col = get_mongodb('proc_baidu', col_src, profile='mongo-raw')
        target_col = get_mongodb(db_tar, col_tar, profile='mongo')

        cursor = source_col.find({})
        cursor.skip(self.args.skip)
        if self.args.limit:
            cursor.limit(self.args.limit)

        for doc in cursor:
            # The document is bound as a default argument so that every task
            # keeps its own entry instead of the loop's last value.
            def task(entry=doc):
                baidu = entry['source']['baidu']
                surl = baidu['surl'] if 'surl' in baidu else ''
                self.log(u'Processing: zhName=%s, sid=%s, surl=%s' % (entry['zhName'], baidu['id'], surl))

                self.resolve_targets(entry)

                # Start from the stored target when there is one, otherwise
                # let the mergers fill an empty document.
                target = target_col.find_one({'source.baidu.id': entry['source']['baidu']['id']}) or {}
                for merger in self.mergers:
                    merger.process(entry, target)

                # Nothing is saved when the mergers left the target empty.
                if not target:
                    return
                target['taoziEna'] = True
                target['lxpEna'] = True
                target_col.save(target)

            self.add_task(task)
Example #3
0
    def run(self):
        """
        Normalise the ``images`` entries of every document in the collection
        named by ``self.args.db`` / ``self.args.col``.

        For each image whose ``key`` contains a 32-character lowercase hex
        string, and for which ``imagestore.Images`` holds a record with that
        string as ``key`` or ``url_hash``:

        * ``key`` is replaced by the bare hex string,
        * the ``url`` field is dropped,
        * ``cropHint`` is dropped when its ``bottom`` and ``right`` are both 0.

        Documents that were changed are written back with a ``$set`` on
        ``images``. ``self.args.skip`` / ``self.args.limit`` restrict the scan.
        """
        from utils.database import get_mongodb

        col = get_mongodb(self.args.db, self.args.col, profile='mongo')
        col_im = get_mongodb('imagestore', 'Images', profile='mongo')

        # Only the images field is needed; the _id ordering keeps skip/limit
        # windows stable between runs.
        cursor = col.find({'images': {'$ne': None}}, {'images': 1}).sort('_id', pymongo.ASCENDING)
        cursor.skip(self.args.skip)
        if self.args.limit:
            cursor.limit(self.args.limit)

        self.total = 0
        super(ImageValidator, self).run()
        for entry in cursor:
            # Bind the document as a default argument so each task keeps its own.
            def func(val=entry):
                if not val.get('images'):
                    return

                modified = False
                for img in val['images']:
                    key = img['key']
                    match = re.search(r'[0-9a-f]{32}', key)
                    if not match:
                        continue
                    new_key = match.group()

                    # Look the extracted key up in the image store.
                    ret = col_im.find_one({'$or': [{'key': new_key}, {'url_hash': new_key}]})
                    if not ret:
                        print('Image not exists: %s' % key)
                        continue

                    if img['key'] != new_key:
                        modified = True
                        img['key'] = new_key

                    if 'url' in img:
                        modified = True
                        img.pop('url')

                    if 'cropHint' in img:
                        ch = img['cropHint']
                        if ch['bottom'] == 0 and ch['right'] == 0:
                            modified = True
                            img.pop('cropHint')

                if modified:
                    print('Updating %s' % val['_id'])
                    col.update({'_id': val['_id']}, {'$set': {'images': val['images']}})

            self.add_task(func)
            gevent.sleep(0)

        self._join()
Example #4
0
    def parse(self, entry):
        """
        Attach the parsed contents of a Mafengwo comment to the POI it refers to.

        The POI is searched by ``entry['poi_id']`` (field
        ``source.mafengwo.id``) in the ViewSpot, Restaurant and Shopping
        collections, in that order; the first hit wins. When no POI is found
        nothing is emitted. Every ``(item_type, item_data)`` pair yielded by
        ``self.parse_contents`` is passed to ``self.update``; pairs other than
        ``'image'`` are first tagged with the comment id, the POI type and the
        POI id.
        """
        lookups = (
            ('vs', get_mongodb('poi', 'ViewSpot', 'mongo')),
            ('dining', get_mongodb('poi', 'Restaurant', 'mongo')),
            ('shopping', get_mongodb('poi', 'Shopping', 'mongo')),
        )

        matched = None
        for poi_type, collection in lookups:
            doc = collection.find_one({'source.mafengwo.id': entry['poi_id']}, {'_id': 1})
            if doc:
                matched = (poi_type, doc['_id'])
                break

        if matched is None:
            return
        matched_type, matched_id = matched

        for item_type, item_data in self.parse_contents(entry['contents']):
            if item_type != 'image':
                item_data['source'] = {'mafengwo': {'id': entry['comment_id']}}
                item_data['type'] = matched_type
                item_data['itemId'] = matched_id

            self.update(item_type, item_data)
Example #5
0
    def build_city_map(self, refresh_redis=False):
        """
        Build the mapping from cities in our own database to Dianping cities.

        Every document of ``raw_dianping.City`` is matched against
        ``geo.Locality`` by ``alias == city_name``. Only an unambiguous match
        (exactly one candidate) is accepted; zero or several candidates are
        logged as warnings. The outcome, including "no match", is cached in
        Redis under ``dianping:norm_city_<city_name>``. The resulting dict,
        keyed by Locality ``_id`` with the Dianping city document as value, is
        stored in ``self.city_map``.

        :param refresh_redis: when true, ignore cached values, redo the
            lookup and overwrite the cache.
        """
        # Needed by eval() below: cached values are Python reprs that may
        # contain ``ObjectId(...)``.
        from bson import ObjectId  # noqa: F401

        redis = self.engine.redis
        city_map = {}

        col_dp = get_mongodb('raw_dianping', 'City', 'mongo-raw')
        col_loc = get_mongodb('geo', 'Locality', 'mongo')
        for city_item in col_dp.find({}):
            city_name = city_item['city_name']
            redis_key = 'dianping:norm_city_%s' % city_name
            norm_city_info = None

            if refresh_redis or not redis.exists(redis_key):
                candidates = list(
                    col_loc.find({'alias': city_name}, {'_id': 1}))
                if len(candidates) > 1:
                    self.log('Duplicate cities found for %s' % city_name,
                             logging.WARN)
                elif not candidates:
                    self.log('No city found for %s' % city_name, logging.WARN)
                else:
                    norm_city_info = candidates[0]

                redis.set(redis_key, norm_city_info)
            else:
                # SECURITY NOTE(review): eval() executes whatever is stored
                # under this key, so anyone able to write to Redis can run
                # code here. Kept because the cache format is the Python repr
                # written above; consider a safe serialisation instead.
                norm_city_info = eval(redis.get(redis_key))

            if norm_city_info:
                city_id = norm_city_info['_id']
                city_map[city_id] = city_item

        self.city_map = city_map
    def look_up_vs(self):
        """
        Map ly view spots to our own view spots.

        Each view spot from ``self.vs_generate()`` is first matched by name
        against the ``alias`` field of ``poi.ViewSpot``. When that fails, the
        first Baidu suggestion for the name is used, provided its
        ``type_code`` is at least 6 and it lies less than 50 km from the ly
        coordinates. Matches are upserted into ``poi.LyMapping`` and flagged
        on the raw ly record (``mapOk`` for a name match, ``mapEstimated`` for
        a suggestion match).
        """
        conn_taozi = get_mongodb('poi', 'ViewSpot', 'mongo')
        conn_raw_ly = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
        coon_mapping = get_mongodb('poi', 'LyMapping', 'mongo')

        for vs in self.vs_generate():
            # Bind the view spot as a default argument so each task keeps its own.
            def map_vs(vs_info=vs):
                ly_id = int(vs_info['lyId'])
                ly_name = vs_info['lyName']

                # 1) Direct name match against our own view spots.
                exact = conn_taozi.find_one({'alias': ly_name}, {'_id': True, 'zhName': True})
                if exact is not None:
                    mapping = {'itemId': exact['_id'], 'zhNameLxp': exact['zhName'], 'zhNameLy': ly_name, 'lyId': ly_id}
                    coon_mapping.update({'itemId': exact['_id']}, {'$set': mapping}, upsert=True)
                    conn_raw_ly.update({'lyId': ly_id}, {'$set': {'mapOk': True}}, upsert=False)
                    return

                # 2) Fall back to the Baidu suggestion service.
                suggs = self.get_baidu_sug(ly_name, None)
                if not len(suggs):
                    return
                target = suggs[0]  # only the first suggestion is considered
                if not (target['type_code'] >= 6):
                    return
                # Distance is in km.
                if not (self.cal_dist(vs_info['lat'], vs_info['lng'], target['lat'], target['lng']) < 50):
                    return

                guess = conn_taozi.find_one({'source.baidu.id': target['sid']}, {'_id': True, 'zhName': True})
                if guess is None:
                    return
                mapping = {'itemId': guess['_id'], 'zhNameLxp': guess['zhName'], 'zhNameLy': ly_name, 'lyId': ly_id, 'mapEstimated': True}
                coon_mapping.update({'itemId': guess['_id']}, {'$set': mapping}, upsert=True)
                conn_raw_ly.update({'lyId': ly_id}, {'$set': {'mapEstimated': True}}, upsert=False)

            self.add_task(map_vs)
Example #7
0
    def sync_images(self):
        """
        Synchronise the image editing state between two ViewSpot collections.

        For every document of ``poi.ViewSpot`` under the ``mongo`` profile,
        the fields ``isDone`` and ``images`` are copied to the document with
        the same ``_id`` under the ``mongo-cms`` profile; a field missing in
        the source is unset in the destination.

        :return: None
        """
        synced_fields = ['isDone', 'images']

        src_col = get_mongodb('poi', 'ViewSpot', 'mongo')
        dst_col = get_mongodb('poi', 'ViewSpot', 'mongo-cms')

        for val in src_col.find({}, {'isDone': 1, 'images': 1}):
            # Bind the document as a default argument so each task keeps its own.
            def task_func(entry=val):
                to_set = {key: entry[key] for key in synced_fields if key in entry}
                to_unset = {key: 1 for key in synced_fields if key not in entry}

                ops = {}
                if to_set:
                    ops['$set'] = to_set
                if to_unset:
                    ops['$unset'] = to_unset

                if ops:
                    dst_col.update({'_id': entry['_id']}, ops)

            self.add_task(task_func)
Example #8
0
    def process_item(self, item, spider):
        """
        Merge a Qyer city item into ``geo.Locality``.

        The target locality is located by its Qyer id; failing that, by alias
        within the same country near the item's coordinates, accepted only if
        ``utils.haversine`` reports at most 100. When nothing matches a new
        document is created. The locality's names, aliases, source, country,
        description, images, location and flags are then (re)written and the
        document is saved.

        :param item: the city item
        :param spider: the spider that produced the item (unused here)
        :return: the item, unchanged
        """
        col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')

        # Country documents are cached on the class, keyed by country code.
        country_code = item['country_code']
        country_cache = QyerCityProcPipeline.country_map
        if country_code in country_cache:
            country = country_cache[country_code]
        else:
            col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
            country = col_country.find_one({'code': country_code})
            assert country is not None
            country_cache[country_code] = country

        city_id = item['city_id']
        city = col_loc.find_one({'source.qyer.id': city_id})

        if not city:
            near_point = {'$near': {'type': 'Point', 'coordinates': [item['lng'], item['lat']]}}
            city = col_loc.find_one({'alias': item['zh_name'].lower(),
                                     'location': near_point,
                                     'country._id': country['_id']})
            if city:
                coords = city['location']['coordinates']
                dist = utils.haversine(coords[0], coords[1], item['lng'], item['lat'])
                # A same-named locality that is too far away is not the same city.
                if dist > 100:
                    city = {}

        if not city:
            city = {}

        short_name = utils.get_short_loc(item['zh_name'])
        city['enName'] = item['en_name']
        city['zhName'] = short_name

        # Union of stored aliases, item aliases and the short name, normalised
        # to lowercase/stripped, with empty strings removed.
        names = city['alias'] if 'alias' in city and city['alias'] else []
        extra = item['alias'] if 'alias' in item and item['alias'] else []
        names.extend(extra)
        names.append(short_name)
        city['alias'] = list({name for name in (raw.lower().strip() for raw in names) if name})

        source = city['source'] if 'source' in city else {}
        source['qyer'] = {'id': item['city_id'], 'url': item['url']}
        city['source'] = source

        country_ref = {'id': country['_id'], '_id': country['_id']}
        for k in ('enName', 'zhName'):
            if k in country:
                country_ref[k] = country[k]
        city['country'] = country_ref

        city['level'] = 2
        city['desc'] = item['desc']
        city['imageList'] = item['imageList']
        city['images'] = []
        city['location'] = {'type': 'Point', 'coordinates': [item['lng'], item['lat']]}
        city['abroad'] = country_code != 'CN'
        city['isHot'] = item['is_hot'] > 0

        col_loc.save(city)

        return item
Example #9
0
    def add_image(image_url):
        """
        Register an image URL as a candidate unless it is already stored.

        The MD5 hex digest of the URL serves as both ``key`` and ``url_hash``.
        When ``imagestore.Images`` has no record with that key, the image is
        upserted into ``imagestore.ImageCandidates``.

        :param image_url: URL of the image
        :return: the image key (MD5 hex digest of the URL)
        """
        from hashlib import md5

        digest = md5(image_url).hexdigest()

        stored = get_mongodb('imagestore', 'Images', 'mongo')
        if stored.find_one({'key': digest}, {'_id': 1}):
            return digest

        candidates = get_mongodb('imagestore', 'ImageCandidates', 'mongo')
        record = {'url_hash': digest, 'key': digest, 'url': image_url}
        candidates.update({'key': digest}, {'$set': record}, upsert=True)
        return digest
Example #10
0
    def populate_tasks(self):
        """
        Queue tasks that fetch Mafengwo search suggestions for Baidu names.

        Documents are read from ``raw_baidu.BaiduPoi`` and then
        ``raw_baidu.BaiduLocality``, filtered by the JSON query in
        ``self.args.query`` and restricted by ``self.args.skip`` /
        ``self.args.limit``. For each distinct non-blank name
        (``ambiguity_sname`` / ``sname``) the suggestion URL is requested and
        the response body stored in ``raw_mfw.MfwSug`` under the MD5 of the
        URL, unless a record with that key already exists.
        """
        from urllib import quote

        raw_collections = [
            get_mongodb('raw_baidu', 'BaiduPoi', 'mongo-raw'),
            get_mongodb('raw_baidu', 'BaiduLocality', 'mongo-raw'),
        ]
        col = get_mongodb('raw_mfw', 'MfwSug', 'mongo-raw')

        query = json.loads(self.args.query) if self.args.query else {}
        projection = {'ambiguity_sname': 1, 'sname': 1, 'sid': 1}

        for col_raw in raw_collections:
            cursor = col_raw.find(query, projection).skip(self.args.skip)
            if self.args.limit:
                cursor.limit(self.args.limit)

            for val in cursor:
                # Bind the document as a default argument so each task keeps its own.
                def func(entry=val):
                    candidates = (entry['ambiguity_sname'], entry['sname'])
                    for name in set(n for n in candidates if n.strip()):
                        self.log(u'Parsing: %s, id=%s' % (name, entry['sid']))

                        url = 'http://www.mafengwo.cn/group/ss.php?callback=j&key=%s' % quote(
                            name.encode('utf-8'))
                        key = md5(url).hexdigest()

                        if col.find_one({'key': key}, {'_id': 1}):
                            # The record already exists
                            self.log(u'Already exists, skipping: %s, id=%s' %
                                     (name, entry['sid']))
                            continue

                        response = self.request.get(url)
                        if not response:
                            self.log(
                                u'Failed to query url: %s, %s, id=%s' %
                                (url, name, entry['sid']), logging.ERROR)
                            continue

                        record = {
                            'key': key,
                            'body': response.text,
                            'name': name,
                            'url': url
                        }
                        col.update({'key': key}, record, upsert=True)

                self.add_task(func)
Example #11
0
    def parse(self, response):
        """
        Turn populated places from ``raw_data.GeoNames`` into geocode requests.

        Selects GeoNames records with ``featureClass == 'P'`` and a positive
        population, optionally restricted to the country codes in
        ``response.meta['country']``. For each record a ``CityItem`` is filled
        in and a Google geocode request is yielded with the item in its meta;
        the response is handled by ``self.parse_geocode``. Records whose
        country is unknown in ``geo.Country`` are skipped (the miss is logged
        once per country code).

        :param response: response whose ``meta['country']`` holds the list of
            country codes to process (may be empty)
        """
        col = get_mongodb('raw_data', 'GeoNames', profile='mongodb-crawler')
        countries = response.meta['country']

        query = {'featureClass': 'P', 'population': {'$gt': 0}}
        if countries:
            if len(countries) > 1:
                query['$or'] = [{'country': tmp.upper()} for tmp in countries]
            else:
                query['country'] = countries[0].upper()

        for city in col.find(query):
            item = CityItem()
            item['city_id'] = city['_id']
            item['en_name'] = city['asciiName']
            item['zh_name'] = city['enName']

            item['lat'] = city['lat']
            item['lng'] = city['lng']
            item['population'] = city['population']
            item['level'] = city['featureCode']

            # Aliases come from the GeoNames record. The item is brand new and
            # has no 'alias' yet, so reading it from the item would raise.
            aliases = set(tmp.lower().strip() for tmp in (city.get('alias') or []))
            aliases.add(city['asciiName'].lower())
            aliases.add(city['enName'].lower())
            for val in city['altName']:
                aliases.add(val.lower())
            item['alias'] = list(aliases)

            # Country documents and known misses are cached on the class.
            country_code = city['country']
            item['country_code'] = country_code
            if country_code in GeoNamesProcSpider.country_map:
                country = GeoNamesProcSpider.country_map[country_code]
            elif country_code not in GeoNamesProcSpider.missed_countries:
                col_country = get_mongodb('geo', 'Country', profile='mongodb-general')
                country = col_country.find_one({'code': country_code})
                if not country:
                    self.log('MISSED COUNTRY: %s' % country_code, log.WARNING)
                    GeoNamesProcSpider.missed_countries.add(country_code)
                    continue
                GeoNamesProcSpider.country_map[country_code] = country
            else:
                continue
            item['en_country'] = country['enName'] if 'enName' in country else None
            item['zh_country'] = country['zhName'] if 'zhName' in country else None

            yield Request(url='http://maps.googleapis.com/maps/api/geocode/json?address=%s,%s&sensor=false' % (
                item['en_name'], item['en_country']), callback=self.parse_geocode, meta={'item': item, 'lang': 'zh'},
                          headers={'Accept-Language': 'zh-CN'}, dont_filter=True)
Example #12
0
    def process_item(self, item, spider):
        """
        Upsert a Mafengwo question or answer into the ``raw`` database.

        Items rejected by ``self.is_handler`` are passed through untouched.
        Questions go to ``MafengwoQuestion`` keyed on ``q_id``; everything
        else goes to ``MafengwoAnswer`` keyed on ``a_id``.

        :return: the item, unchanged
        """
        if not self.is_handler(item, spider):
            return item

        data = item['data']
        item_type = item['type']

        if item_type == 'question':
            col_name, id_field = 'MafengwoQuestion', 'q_id'
        else:
            col_name, id_field = 'MafengwoAnswer', 'a_id'

        col = get_mongodb('raw', col_name, 'mongo-raw')
        col.update({id_field: data[id_field]}, {'$set': data}, upsert=True)
        return item
Example #13
0
    def resolve_targets(item):
        """
        Resolve the Mafengwo breadcrumb ids of an item into database references.

        ``item['data']['crumbIds']`` is removed and each id is looked up in
        ``geo.Locality``; until a country has been found, ids without a
        locality match are also looked up in ``geo.Country``. The results are
        written back to ``item['data']``:

        * ``targets``: the ``_id`` of every matched locality/country, in
          breadcrumb order,
        * ``country``: the first matched country document (if any),
        * ``locality``: ``_id`` plus available ``zhName``/``enName`` of the
          last breadcrumb that matched a locality (if any).

        :param item: item whose ``data`` dict is updated in place
        """
        data = item['data']

        col_mdd = get_mongodb('geo', 'Locality', 'mongodb-general')
        col_country = get_mongodb('geo', 'Country', 'mongodb-general')

        crumb_list = data.pop('crumbIds')
        targets = []
        country_found = False
        # The last breadcrumb that resolves to a locality is the city.
        # Remembering it here avoids querying every id a second time.
        last_locality = None

        for cid in crumb_list:
            match = col_mdd.find_one({'source.mafengwo.id': cid}, {
                '_id': 1,
                'zhName': 1,
                'enName': 1
            })
            if match:
                last_locality = match
            elif not country_found:
                match = col_country.find_one({'source.mafengwo.id': cid}, {
                    '_id': 1,
                    'zhName': 1,
                    'enName': 1,
                    'code': 1
                })
                if match:
                    # Store it in the country field.
                    data['country'] = match
                    country_found = True
            if match:
                targets.append(match['_id'])
        data['targets'] = targets

        if last_locality:
            city = {'_id': last_locality['_id']}
            for key in ('zhName', 'enName'):
                if key in last_locality:
                    city[key] = last_locality[key]
            data['locality'] = city
Example #14
0
    def resolve_targets(data):
        """
        Resolve the Baidu location chain in ``data['locList']`` to database records.

        Does nothing when ``data`` has no ``locList``. Otherwise ``locList`` is
        removed and walked in order: until a country has been found each entry
        is looked up in ``geo.Country`` by ``sname`` (alias); afterwards
        entries are looked up in ``geo.Locality`` by Baidu ``sid``. Entries
        without a match are skipped.

        Results written to ``data``:

        * ``country``: the matched country document (only when found),
        * ``abroad``: whether the country's ``zhName`` is outside the listed
          Chinese names, or None when no country was found,
        * ``locList``: the matched documents (only when there is at least one).

        :param data: dict updated in place
        :return: None
        """
        if 'locList' not in data:
            return

        col_country = get_mongodb('geo', 'Country', 'mongo')
        col_mdd = get_mongodb('geo', 'Locality', 'mongo')

        country = None
        target_list = []
        for loc in data.pop('locList'):
            if country is None:
                hit = col_country.find_one({'alias': loc['sname']}, {'zhName': 1, 'enName': 1})
            else:
                hit = col_mdd.find_one({'source.baidu.id': loc['sid']}, {'zhName': 1, 'enName': 1})
            if not hit:
                continue

            # The first hit is the country; it is part of the target list too.
            if country is None:
                country = hit
            target_list.append(hit)

        if country:
            data['country'] = country
            data['abroad'] = country['zhName'] not in [u'中国', u'澳门', u'香港', u'台湾']
        else:
            data['abroad'] = None
        if target_list:
            data['locList'] = target_list
Example #15
0
    def process_item(self, item, spider):
        """
        Create or update a record in ``raw_data.YahooCityInfo``.

        An existing record is looked up by ``woeid`` (level 2) when the item
        has one, otherwise by ``country`` and ``state`` (level 1). The fields
        present on the item are copied onto that record, or onto a new one,
        which is then saved.

        :return: the item, unchanged
        """
        col = get_mongodb('raw_data',
                          'YahooCityInfo',
                          profile='mongodb-crawler')

        # Retrieve the locality if it exists.
        if 'woeid' in item:
            selector = {'woeid': item['woeid'], 'level': 2}
        else:
            selector = {
                'country': item['country'],
                'state': item['state'],
                'level': 1
            }
        data = col.find_one(selector) or {}

        copied_fields = ('country', 'state', 'city', 'abroad', 'coords',
                         'woeid', 'level')
        for field in copied_fields:
            if field in item:
                data[field] = item[field]

        col.save(data)

        return item
Example #16
0
    def parse(self, response):
        """
        Yield a reverse-geocoding request for every Qyer POI of the requested countries.

        For each country in ``response.meta['countries']`` the matching
        records of ``raw_data.QyerSpot`` are read. Records with a falsy
        latitude or longitude are skipped. The record's fields known to
        ``QyerPoiItem`` are copied into an item, which travels in the meta of
        the geocode request handled by ``self.parse_geocode``.
        """
        meta = response.meta
        col_raw = get_mongodb('raw_data',
                              'QyerSpot',
                              profile='mongodb-crawler')

        for country in meta['countries']:
            # Look up the POIs of the given country.
            pois = col_raw.find({'country_info.country_engname': country})
            for entry in pois:
                lat, lng = entry['poi_lat'], entry['poi_lng']
                if not (lat and lng):
                    continue

                item = QyerPoiItem()
                for field in entry:
                    if field in item.fields:
                        item[field] = entry[field]

                # This step is used to find out which city the POI is in.
                geocode_url = 'http://maps.googleapis.com/maps/api/geocode/json?address=%f,%f' % (
                    lat, lng)
                yield Request(url=geocode_url,
                              meta={'item': item},
                              callback=self.parse_geocode,
                              dont_filter=True)
Example #17
0
    def process_item(self, item, spider):
        """
        Persist a crawled item into the ``raw_ly`` database.

        The action is selected by ``item.name``:

        * ``'ticket'``: upsert the ticket (matched on ``pid``) into the
          ``Ticket`` collection.
        * ``'vs-ticket-info'``: set ``ticketInfo`` and ``crawl`` on the
          ``ViewSpot`` document matched on ``lyId`` (no upsert).
        * ``'ticket-delete'`` and any other name: nothing is written.

        :param item: the item to store
        :param spider: the spider that produced the item (unused here)
        :return: the item, unchanged
        """
        kind = item.name

        if kind == 'ticket':
            con_ticket = get_mongodb('raw_ly', 'Ticket', profile='mongo-raw')
            con_ticket.update({'pid': item['pid']},
                              {'$set': {'pid': item['pid'], 'lyId': item['lyId'], 'info': item['info'], 'stockList': item['stock_list']}},
                              upsert=True)
        elif kind == 'ticket-delete':
            # Removal of stale tickets is currently disabled:
            # con_ticket.remove({'lyId': item['ly_id'], 'pid': {'$nin': item['id_list']}}, multi=True)
            pass
        elif kind == 'vs-ticket-info':
            con_vs = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
            con_vs.update({'lyId': item['ly_id']}, {'$set': {'ticketInfo': item['info'], 'crawl': True}})

        return item
Example #18
0
    def crawl_vs(self):
        """
        Crawl view spots through the ly API, one task per city.

        For each entry returned by ``self.crawl_city()`` the scenery list is
        requested. The ``location_id`` is sent as ``provinceId`` (<= 35),
        ``cityId`` (<= 404) or ``countryId`` (anything larger). When the
        response code is ``0000`` every result page is fetched and each
        scenery id/name is upserted into ``raw_ly.ViewSpot``.

        :return: None
        """
        conn = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
        city_list = self.crawl_city()
        self.logger.info('-=-=-=-=length: %s' % len(city_list))

        for ct in city_list:
            # Bind the city as a default argument so each task keeps its own.
            def func(city=ct):
                self.logger.info('================%s==============' % city)

                location_id = int(city['location_id'])
                if location_id <= 35:
                    id_field = 'provinceId'
                elif location_id <= 404:
                    id_field = 'cityId'
                else:
                    id_field = 'countryId'
                query_obj = {'clientIp': '127.0.0.1', id_field: location_id}

                # The first request only serves to learn the number of pages.
                first_page = etree.fromstring(self.scenerylist().send_request(query_obj))
                responce_code = first_page.xpath('//rspCode/text()')[0]
                if '0000' != str(responce_code):
                    return

                total_page = first_page.xpath('//sceneryList')[0].attrib['totalPage']
                for page_no in xrange(1, int(total_page) + 1):
                    page_query = copy.deepcopy(query_obj)
                    page_query['page'] = page_no
                    page_xml = self.scenerylist().send_request(page_query)
                    for name_node in etree.fromstring(page_xml).xpath('//sceneryName'):
                        name = name_node.text.encode('utf-8')
                        ly_id = int(name_node.xpath('../sceneryId')[0].text)
                        self.logger.info('----%s-----%s' % (name, ly_id))
                        conn.update({'lyId': ly_id}, {'$set': {'lyId': ly_id, 'lyName': name}}, upsert=True)

            self.add_task(func)
Example #19
0
    def build_cursor(self):
        """
        Build the cursor over restaurants that still lack a Dianping id.

        Selects ``poi.Restaurant`` documents whose ``source.dianping.id`` is
        missing/None and whose locality is one of the keys of
        ``self.city_map``. An optional extra filter from ``self.args.query``
        is AND-ed in. ``self.args.skip`` and ``self.args.limit`` restrict the
        cursor.

        :return: the cursor, projecting locality, zhName, alias and location
        """
        col = get_mongodb('poi', 'Restaurant', 'mongo')
        query = {
            'source.dianping.id': None,
            'locality._id': {
                '$in': list(self.city_map)
            }
        }

        extra_query = {}
        if self.args.query:
            # Made available so the expression may use ``ObjectId(...)``.
            from bson import ObjectId  # noqa: F401
            # SECURITY NOTE(review): eval() runs the argument as Python code.
            # That is acceptable only while it comes from a trusted operator
            # on the command line; never feed it external input.
            extra_query = eval(self.args.query)
        if extra_query:
            query = {'$and': [query, extra_query]}

        cursor = col.find(query, {
            'locality': 1,
            'zhName': 1,
            'alias': 1,
            'location': 1
        }).skip(self.args.skip)
        if self.args.limit:
            cursor.limit(self.args.limit)
        return cursor
Example #20
0
    def process(self, entry):
        """
        Fetch the Qunar image list of a POI and register each image.

        The image list is requested for ``entry['source']['qunar']['id']``;
        a failed request or an unusable response is logged and ends the call.
        Each image is keyed by the MD5 of its URL. An existing record in
        ``imagestore.Images`` is updated (and linked to the POI through
        ``itemIds``); when there is none, the image is upserted into
        ``imagestore.ImageCandidates`` instead.

        :param entry: POI document carrying ``_id`` and the Qunar source id
        """
        qunar_id = entry['source']['qunar']['id']

        image_list_url = 'http://travel.qunar.com/place/api/poi/image?offset=0&limit=1000&poiId=%d' % qunar_id
        self.logger.debug('Processing poi: %d, url: %s' %
                          (qunar_id, image_list_url))

        proxy_options = {'ProxyMiddleware': {
            'validator': [qunar_validator, qunar_json_validator]
        }}
        try:
            response = self.request.get(image_list_url, user_data=proxy_options)
            images = response.json()['data']
        except (IOError, ValueError, KeyError):
            self.logger.warn('Failed: %s' % image_list_url)
            return

        if not images:
            return

        col_im = get_mongodb('imagestore', 'Images', 'mongo')
        col_cand = get_mongodb('imagestore', 'ImageCandidates', 'mongo')

        for position, img_entry in enumerate(images):
            url = img_entry['url']
            key = md5(url).hexdigest()

            # The URL hash doubles as the image key; 'ord' keeps the list order.
            image = {
                'url': url,
                'key': key,
                'url_hash': key,
                'ord': position
            }

            uploader = img_entry['userName']
            if uploader:
                image['meta'] = {'userName': uploader}

            self.logger.debug('Retrieved image: %s, url=%s, poi=%d' %
                              (key, url, qunar_id))
            ops = {'$set': image, '$addToSet': {'itemIds': entry['_id']}}
            ret = col_im.update({'url_hash': key}, ops)
            # Not stored yet: record it as a candidate instead.
            if not ret['updatedExisting']:
                col_cand.update({'url_hash': key}, ops, upsert=True)
Example #21
0
    def update_city(self, item):
        """
        Replace the candidate city names of a POI item with a Locality reference.

        ``item['poi_city']`` holds candidate city names. They are tried in
        order: a candidate matches when ``geo.Locality`` has a document in the
        item's country whose alias starts with the lowercased name and which
        lies within 100 km of the POI. The first match wins and
        ``item['poi_city']`` is replaced by its ids and names.

        :param item: POI item with ``poi_city``, ``country_info``,
            ``poi_lng`` and ``poi_lat``
        :return: the updated item, or None when no locality matched (a
            warning is logged)
        """
        city_candidates = item['poi_city']
        country_info = item['country_info']

        # Look up the city.
        city = None
        col_loc = get_mongodb('geo', 'Locality', profile='mongodb-general')
        for city_name in city_candidates:
            # The name is data, not a pattern: escape it so that characters
            # such as '.', '(' or '+' are matched literally and cannot break
            # the regular expression.
            alias_prefix = re.compile(r'^%s' % re.escape(city_name.lower()))
            city_list = list(
                col_loc.find(
                    {
                        'country.id': country_info['_id'],
                        'alias': alias_prefix,
                        'location': {
                            '$near': {
                                '$geometry': {
                                    'type': 'Point',
                                    'coordinates': [item['poi_lng'], item['poi_lat']]
                                },
                                '$minDistance': 0,
                                '$maxDistance': 100 * 1000
                            }
                        }
                    }, {
                        'zhName': 1,
                        'enName': 1,
                        'coords': 1
                    }).limit(5))
            if city_list:
                city = city_list[0]
                break

        if not city:
            self.log(
                'Failed to find locality from DB: %s' %
                ', '.join(city_candidates), log.WARNING)
            return

        # Distinct non-empty names, used as a fallback when one of them is
        # missing from the locality.
        stripped = [(city[k].strip() if k in city and city[k] else '')
                    for k in ['zhName', 'enName']]
        alias_names = list(set(name for name in stripped if name))

        def pick_name(key):
            # Missing key or a None value: fall back to the other name.
            try:
                return city[key].strip()
            except (ValueError, KeyError, AttributeError):
                return alias_names[0]

        zhName = pick_name('zhName')
        enName = pick_name('enName')
        item['poi_city'] = {
            'id': city['_id'],
            '_id': city['_id'],
            'zhName': zhName,
            'enName': enName
        }
        return item
Example #22
0
    def process_item(self, item, spider):
        """
        Upsert a Qunar question or answer into the ``raw`` database.

        Items rejected by ``self.is_handler`` are passed through untouched.
        Questions go to ``QunarQuestion``, everything else to ``QunarAnswer``;
        both are keyed on ``post_id``.

        :return: the item, unchanged
        """
        if not self.is_handler(item, spider):
            return item

        data = item['data']
        item_type = item['type']

        col_name = 'QunarQuestion' if item_type == 'question' else 'QunarAnswer'
        col = get_mongodb('raw', col_name, 'mongo-raw')
        col.update({'post_id': data['post_id']}, {'$set': data}, upsert=True)
        return item
Example #23
0
    def mv_candidates(image):
        """
        Move an image from ``imagestore.Images`` to ``imagestore.ImageCandidates``.

        The image's ``url_hash`` and ``key`` are recomputed from the MD5 of
        its URL, the record is written to the candidates collection and the
        original is removed from ``Images``.

        :param image: image document; it is modified in place (``_id`` is
            removed, ``url_hash`` and ``key`` are overwritten)
        """
        from utils.database import get_mongodb

        col_cand = get_mongodb('imagestore', 'ImageCandidates', profile='mongo')
        col_img = get_mongodb('imagestore', 'Images', profile='mongo')

        image_id = image.pop('_id')

        print('Moving %s' % image['url'])
        image['url_hash'] = md5(image['url']).hexdigest()
        image['key'] = image['url_hash']
        # Upsert: without it an image that has no candidate record yet would
        # be written nowhere and then deleted below, i.e. lost.
        col_cand.update({'url_hash': image['url_hash']}, {'$set': image}, upsert=True)

        col_img.remove({'_id': image_id})
Example #24
0
 def bind_shop_id(shop, dianping_id):
     """
     Record the Dianping id on a restaurant.

     Sets ``source.dianping`` of the ``poi.Restaurant`` document with the
     shop's ``_id`` to ``{'id': dianping_id}``.

     :param shop: restaurant document (only ``_id`` is read)
     :param dianping_id: the Dianping id to bind
     """
     restaurants = get_mongodb('poi', 'Restaurant', 'mongo')
     selector = {'_id': shop['_id']}
     change = {'$set': {'source.dianping': {'id': dianping_id}}}
     restaurants.update(selector, change)
 def vs_generate(self):
     """
     Generate the view spots that are still waiting to be processed.

     Reads all ``raw_ly.ViewSpot`` documents with ``mapped == False`` up
     front, then marks each one as mapped just before yielding it.
     """
     conn = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
     projection = {'lyId': 1, 'lyName': 1, 'lat': 1, 'lng': 1}
     # The result is materialised first because the loop updates the very
     # documents the query selects.
     pending = list(conn.find({'mapped': False}, projection))
     for entry in pending:
         conn.update({'lyId': entry['lyId']}, {'$set': {'mapped': True}}, upsert=False)
         yield entry
Example #26
0
    def process_item(self, item, spider):
        """
        Upsert a Ctrip question or answer into its raw collection.

        Items this pipeline does not handle (according to ``self.is_handler``)
        are passed through untouched. Questions (``item['type'] ==
        'question'``) go to ``CtripQuestion`` matched on ``q_id``; every other
        type goes to ``CtripAnswer`` matched on ``a_id``.

        :return: the item that was passed in
        """
        if not self.is_handler(item, spider):
            return item

        payload = item['data']
        kind = item['type']

        if kind == 'question':
            col_name, id_field = 'CtripQuestion', 'q_id'
        else:
            col_name, id_field = 'CtripAnswer', 'a_id'

        target = get_mongodb('raw_faq', col_name, 'mongo-raw')
        target.update({id_field: payload[id_field]}, {'$set': payload}, upsert=True)
        return item
Example #27
0
    def process_item(self, item, spider):
        """
        Create or refresh the ``Locality`` document for a Yahoo city item.

        Items whose type name is not ``YahooCityItem`` are passed through.
        The existing document is looked up by Yahoo ``woeid`` when the item
        has one, otherwise by country id plus lower-cased English name in
        ``alias``. All fields built here overwrite the stored ones, except
        ``alias``, which is merged with the stored aliases.

        :return: the item that was passed in
        """
        if type(item).__name__ != YahooCityItem.__name__:
            return item

        localities = get_mongodb('geo', 'Locality', profile='mongodb-general')

        level = item['level']
        zh_name = item['zh_name']
        en_name = item['en_name']
        abroad = item['abroad']
        short_name = item['en_name' if abroad else 'zh_name']
        aliases = list(set(item['alias']))
        country = item['country']

        fresh = {
            'zhName': zh_name,
            'enName': en_name,
            'abroad': abroad,
            'shortName': short_name,
            'alias': aliases,
            'pinyin': [],
            'country': {'id': country['_id'], 'zhName': country['zhName'],
                        'enName': country['enName']},
            'level': level,
            'images': [],
        }
        if 'coords' in item:
            fresh['coords'] = item['coords']

        fresh['source'] = {'name': 'yahoo'}
        if 'woeid' in item:
            fresh['source']['id'] = item['woeid']

        if level > 1:
            # Cities: attach the enclosing level-1 locality, if it is known.
            province = localities.find_one(
                {'country.id': country['_id'], 'alias': item['state'].lower(), 'level': 1})
            if province:
                fresh['superAdm'] = {'id': province['_id'], 'zhName': province['zhName'],
                                     'enName': province['enName']}
            else:
                spider.log('Cannot find province: %s, %s' % (item['state'], item['en_country']))

        if 'woeid' in item:
            lookup = {'source.name': 'yahoo', 'source.id': item['woeid']}
        else:
            lookup = {'country.id': country['_id'], 'alias': fresh['enName'].lower()}
        stored = localities.find_one(lookup) or {}

        # Every field except the aliases simply overwrites the stored value.
        for field, value in fresh.items():
            if field != 'alias':
                stored[field] = value

        # Aliases are the union of the stored ones and the new ones.
        stored['alias'] = list(set(stored.get('alias', [])).union(fresh['alias']))

        localities.save(stored)

        return item
Example #28
0
    def process(self, entry):
        """
        Process the comment list of one POI.

        Walks the Qunar comment pages of ``entry['source']['qunar']['id']``
        (50 comments per page, starting at page 1), obtains each page body
        through ``self.redis.get_cache`` and upserts every parsed comment into
        ``PoiComment`` matched on ``comment_id``. Stops on a fetch/parse
        failure, on a non-``success`` ``errmsg``, or after a page that holds
        fewer than 50 comments.
        """
        raw_comments = get_mongodb('raw_qunar', 'PoiComment', 'mongo-raw')
        url_tmpl = 'http://travel.qunar.com/place/api/html/comments/poi/%d?sortField=1&pageSize=%d&page=%d'
        poi_id = entry['source']['qunar']['id']

        page_size = 50
        cache_ttl = 3600 * 24
        page = 1

        while True:
            list_url = url_tmpl % (poi_id, page_size, page)
            self.logger.debug('Fetching: poi: %d, page: %d, url: %s' %
                              (poi_id, page, list_url))

            cache_key = 'qunar:poi-comment:list:%d:%d:%d' % (poi_id,
                                                             page_size, page)

            def fetch_list_body():
                """
                Return the response body of the comment list.
                """
                checks = [qunar_validator, qunar_json_validator]
                reply = self.request.get(
                    list_url,
                    timeout=15,
                    user_data={'ProxyMiddleware': {
                        'validator': checks
                    }})
                return reply.text

            try:
                body_text = self.redis.get_cache(
                    cache_key, fetch_list_body, expire=cache_ttl)
                data = json.loads(body_text)
            except (IOError, ValueError):
                self.logger.warn('Fetching failed: %s' % list_url)
                return

            if data['errmsg'] != 'success':
                self.logger.warn('Fetching failed %s, errmsg: %s' %
                                 (list_url, data['errmsg']))
                return

            parsed = self.parse_comments(data['data'])
            comments = list(parsed) if parsed else []
            for comment in comments:
                comment['poi_id'] = poi_id
                raw_comments.update({'comment_id': comment['comment_id']},
                                    {'$set': comment},
                                    upsert=True)

            # An empty list, or fewer comments than the page size, means the
            # last page has been reached.
            if len(comments) < page_size:
                return

            page += 1
Example #29
0
 def start_requests(self):
     """
     Yield one Yahoo weather forecast request per city in ``CityInfo``.

     Each request carries ``{'city_id', 'city', 'woeid'}`` under
     ``meta['data']`` and is handled by ``self.parse``.
     """
     cities = get_mongodb('raw_data', 'CityInfo', profile='mongodb-crawler')
     for doc in cities.find({}, {'city': 1, 'woeid': 1}):
         city_name = doc['city']
         woeid = doc['woeid']
         payload = {'city_id': doc['_id'], 'city': city_name, 'woeid': woeid}
         feed_url = 'http://weather.yahooapis.com/forecastrss?w=%d&u=c' % woeid
         yield Request(url=feed_url, callback=self.parse, meta={'data': payload})
Example #30
0
 def process_item(self, item, spider):
     """
     Merge the item into its ``QyerSpot`` document and save it.

     The document is looked up by ``poi_id``; when none exists a new one is
     started. Every field of the item overwrites the stored value.

     :return: the item that was passed in
     """
     spots = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
     doc = spots.find_one({'poi_id': item['poi_id']}) or {}
     doc.update((field, item[field]) for field in item.keys())
     spots.save(doc)
     return item
Example #31
0
 def process_item(self, item, spider):
     """
     Save the item's fields on the ``QyerSpot`` document with the same
     ``poi_id``, creating the document when it does not exist yet.

     :return: the item that was passed in
     """
     spots = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
     stored = spots.find_one({'poi_id': item['poi_id']})
     merged = stored if stored else {}
     for field in item.keys():
         merged[field] = item[field]
     spots.save(merged)
     return item
Example #32
0
    def process_item(self, item, spider):
        """
        Upsert a proxy item into the ``Proxy`` collection.

        Items this pipeline does not handle (according to ``self.is_handler``)
        are passed through. The document is matched on ``host`` and ``port``
        and receives every field of the item through ``$set``.

        :return: the item that was passed in
        """
        if not self.is_handler(item, spider):
            return item

        proxies = get_mongodb('misc', 'Proxy', 'mongo')
        selector = {'host': item['host'], 'port': item['port']}
        fields = dict((name, item[name]) for name in item.keys())
        proxies.update(selector, {'$set': fields}, upsert=True)

        return item
Example #33
0
    def get_baidu_sug(self, name, location):
        """
        Look up Baidu Travel destination suggestions for ``name``.

        The raw response body is cached in the ``BaiduSug`` collection under
        the URL-quoted UTF-8 form of ``name``; the request is only sent when
        no cached body exists.

        :param name: the text to search for
        :param location: not used by this method
        :return: a list of dicts with the keys ``sname``, ``parents``,
            ``sid``, ``surl``, ``parent_sid``, ``type_code``, ``lng`` and
            ``lat``; an empty list when no body could be obtained or the body
            cannot be parsed
        """

        # Only referenced by the commented-out alternative conversion below.
        from utils import mercator2wgs
        from urllib import quote

        url = u'http://lvyou.baidu.com/destination/ajax/sug?wd=%s&prod=lvyou_new&su_num=20' % name

        key = quote(name.encode('utf-8'))

        col = get_mongodb('raw_baidu', 'BaiduSug', 'mongo-raw')
        ret = col.find_one({'key': key}, {'body': 1})
        body = None
        if ret:
            body = ret['body']
        else:
            try:
                response = ProcessorEngine.get_instance().request.get(url)
                if response:
                    body = response.text
                    col.update({'key': key}, {
                        'key': key,
                        'body': body,
                        'url': url
                    },
                               upsert=True)
            except IOError:
                # Best effort: a failed fetch is treated like an empty result.
                pass
        if not body:
            return []

        try:
            # The 'sug' value is itself a JSON-encoded string.
            sug = json.loads(json.loads(body)['data']['sug'])
            result = []
            for s in sug['s']:
                # Each suggestion is a '$'-separated record whose fields are
                # picked by position.
                tmp = re.split(r'\$', s)
                entry = {
                    'sname': tmp[0].strip(),
                    'parents': tmp[6].strip(),
                    'sid': tmp[8].strip(),
                    'surl': tmp[22].strip(),
                    'parent_sid': tmp[26].strip(),
                    'type_code': int(tmp[24])
                }

                mx = float(tmp[14])
                my = float(tmp[16])
                entry['lng'], entry['lat'] = self.bd_mc_to_ll(mx, my)
                # entry['lng'], entry['lat'] = mercator2wgs(mx, my)
                result.append(entry)

            return result
        except IOError as e:
            e.message += 'url: %s' % url
            raise e
        except (ValueError, KeyError, IndexError):
            # IndexError covers a record with fewer '$'-separated fields than
            # the positions read above; it is handled like any other
            # malformed payload.
            return []
Example #34
0
 def update(item_type, item_data):
     """
     Upsert a comment or an image candidate.

     :param item_type: ``'comment'`` or ``'image'``
     :param item_data: the document to store. For comments its ``'type'`` key
         (``'vs'``, ``'dining'`` or ``'shopping'``) selects the collection and
         is popped from the dict before the document is written.
     :raises AssertionError: if ``item_type`` is neither ``'comment'`` nor
         ``'image'``
     """
     if item_type == 'comment':
         db_dict = {
             'vs': 'ViewSpotComment',
             'dining': 'DiningComment',
             'shopping': 'ShoppingComment'
         }
         db_name = db_dict[item_data.pop('type')]
         col = get_mongodb('comment', db_name, 'mongo')
         col.update(
             {'source.mafengwo.id': item_data['source']['mafengwo']['id']},
             {'$set': item_data},
             upsert=True)
     elif item_type == 'image':
         col = get_mongodb('imagestore', 'ImageCandidates', 'mongo')
         col.update({'key': item_data['key']}, {'$set': item_data},
                    upsert=True)
     else:
         # Raised explicitly instead of ``assert False`` so that the check is
         # not stripped when Python runs with -O.
         raise AssertionError('Invalid type: %s' % item_type)
Example #35
0
 def store_shops(shop_list):
     """
     Save the shops into the raw_dianping database.

     A shop is only written when no ``Dining`` document with the same
     ``shop_id`` exists yet.
     """
     dining = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')
     for shop in shop_list:
         if dining.find_one({'shop_id': shop['shop_id']}, {'_id': 1}):
             continue
         dining.update({'shop_id': shop['shop_id']}, {'$set': shop},
                       upsert=True)
Example #36
0
 def update_country(self, item):
     """
     Replace ``item['country_info']`` (a country name) with the matching
     ``Country`` document summary.

     The country is looked up by the lower-cased name in ``alias``.

     :return: the updated item, or ``None`` (after logging a warning) when the
         country cannot be found
     """
     name = item['country_info']
     countries = get_mongodb('geo', 'Country', profile='mongodb-general')
     match = countries.find_one({'alias': name.lower()}, {'zhName': 1, 'enName': 1})
     if match:
         item['country_info'] = {'id': match['_id'], '_id': match['_id'],
                                 'zhName': match['zhName'], 'enName': match['enName']}
         return item

     self.log('Failed to find country: %s' % name, log.WARNING)
     return None
Example #37
0
    def parse(self, response):
        """
        Yield a Google geocode request for every selected ``YahooCityInfo``
        document.

        ``response.meta['countries']`` (country codes) and
        ``response.meta['level']`` narrow the selection when they are set.
        Each request carries a partially filled ``YahooCityItem`` in
        ``meta['item']`` and is handled by ``self.parse_geocode``. Documents
        whose country code cannot be resolved are skipped with a warning.
        """
        cities = get_mongodb('raw_data', 'YahooCityInfo', profile='mongodb-crawler')
        countries = response.meta['countries']
        level = response.meta['level']

        query = {}
        if countries:
            query['$or'] = [{'country.countrycode': code} for code in countries]
        if level:
            query['level'] = level

        copied_fields = ['country', 'state', 'city', 'coords', 'woeid', 'abroad', 'level']
        geocode_tmpl = 'http://maps.googleapis.com/maps/api/geocode/json?address=%s,%s&sensor=false'

        # Only the ids are fetched up front; each full document is loaded
        # inside the loop.
        for ref in list(cities.find(query, {'_id': 1})):
            city = cities.find_one({'_id': ref['_id']})

            item = YahooCityItem()
            for field in copied_fields:
                if field in city:
                    item[field] = city[field]

            code = city['country']['countrycode']
            if code not in self.country_map:
                # Resolve the country once and keep it in self.country_map.
                found = get_mongodb('geo', 'Country', profile='mongodb-general').find_one({'code': code})
                if not found:
                    self.log('Unable to find country: %s' % code, log.WARNING)
                    continue
                self.country_map[code] = found

            country = self.country_map[code]

            item['country'] = country
            item['en_country'] = country['enName']
            item['zh_country'] = country['zhName']

            # The city name is used when present, otherwise the state name;
            # the same value fills both language fields.
            place_name = city['city'] if 'city' in city else city['state']
            item['en_name'] = place_name
            item['zh_name'] = place_name

            item['alias'] = [item['en_name'].lower()]

            yield Request(url=geocode_tmpl % (item['en_name'], item['en_country']),
                          callback=self.parse_geocode,
                          meta={'item': item, 'lang': 'zh'},
                          headers={'Accept-Language': 'zh-CN'},
                          dont_filter=True)
Example #38
0
    def parse(self, response):
        """
        Yield one ``QyerCountryItem`` per document in ``QyerCountry``.

        Copies ``countryId``, ``zhName``, ``enName``, ``zhContinent``,
        ``enContinent`` and ``isHot`` of each document into the item.
        """
        col = get_mongodb('raw_data', 'QyerCountry', profile='mongodb-crawler')
        # Iterate a cursor over the documents: the object returned by
        # get_mongodb is used as a collection elsewhere in this file and is
        # not iterable by itself.
        for entry in col.find():
            item = QyerCountryItem()
            item['country_id'] = entry['countryId']
            item['country_zh'] = entry['zhName']
            item['country_en'] = entry['enName']
            item['cont_zh'] = entry['zhContinent']
            item['cont_en'] = entry['enContinent']
            item['is_hot'] = entry['isHot']

            yield item
Example #39
0
    def check_exist(entry):
        """
        Tell whether an image is already present in ``Images``.

        The MD5 hex digest of ``entry['url']`` must equal
        ``entry['url_hash']`` (asserted); the image counts as existing when a
        document with that ``url_hash`` is found.

        :return: ``True`` if such a document exists, else ``False``
        """
        images = get_mongodb('imagestore', 'Images', 'mongo')

        digest = md5(entry['url']).hexdigest()
        assert digest == entry['url_hash']

        match = images.find_one({'url_hash': digest}, {'_id': 1})
        return True if match else False
Example #40
0
    def get_poi_image(self, shop_id, page_idx=1):
        """
        Store the photos listed on one page of a Dianping shop album.

        Fetches ``/shop/<shop_id>/photos?pg=<page_idx>``, rewrites the size
        part of every matching image URL to ``(1024c1024)`` and upserts
        ``{'url_hash', 'key', 'url', 'shop_id'}`` into ``DianpingImage``,
        keyed on the MD5 hex digest of the rewritten URL. List entries without
        a title, with a title containing the placeholder text, or without a
        matching image URL are skipped.

        :param shop_id: numeric Dianping shop id
        :param page_idx: album page number
        """
        album_url = 'http://www.dianping.com/shop/%d/photos?pg=%d' % (shop_id, page_idx)

        validators = [
            lambda v: status_code_validator(v, [200, 404]),
            lambda v: response_size_validator(v, 4096)
        ]
        response = self.request.get(
            album_url,
            timeout=15,
            user_data={'ProxyMiddleware': {'validator': validators}})

        from lxml import etree
        from hashlib import md5

        images = get_mongodb('raw_dianping', 'DianpingImage', 'mongo-raw')

        title_xpath = './div[@class="picture-info"]/div[@class="name"]//a[@href and @title and @onclick]/@title'
        src_xpath = './div[@class="img"]/a[@href and @onclick]/img[@src and @title]/@src'
        # Captures the "/pc/<32 chars>" part that precedes a parenthesised
        # size such as "(249c249)/" or "(700x700)/".
        size_pattern = re.compile(r'(/pc/[0-9a-z]{32})\(\d+[cx]\d+\)/')

        document = etree.fromstring(response.text, parser=etree.HTMLParser())
        for node in document.xpath(
                '//div[@class="picture-list"]/ul/li[@class="J_list"]'):
            titles = node.xpath(title_xpath)
            # Skip entries without a title and entries whose title contains
            # the placeholder text (presumably "default picture").
            if not titles or u'默认图片' in titles[0]:
                continue

            try:
                src = node.xpath(src_xpath)[0]
                if not re.search(size_pattern, src):
                    continue

                src = re.sub(size_pattern, '\\1(1024c1024)/', src)
                digest = md5(src).hexdigest()
                record = {
                    'url_hash': digest,
                    'key': digest,
                    'url': src,
                    'shop_id': shop_id
                }

                images.update({'key': digest}, {'$set': record}, upsert=True)
            except IndexError:
                continue
Example #41
0
    def build_cursor(self):
        """
        Build the cursor over the ``Dining`` collection.

        ``self.args.query`` (a Python expression, evaluated with ``ObjectId``
        in scope) selects the documents, all of them when it is empty.
        ``self.args.skip`` and, when set, ``self.args.limit`` are applied.

        :return: the cursor
        """
        # The local names below are visible to the evaluated expression.
        col = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')

        query = {}
        if self.args.query:
            # Imported so that the expression may refer to ObjectId.
            from bson import ObjectId
            # NOTE(review): eval() runs arbitrary code from self.args.query;
            # make sure this argument only ever comes from a trusted operator.
            query = eval(self.args.query)

        cursor = col.find(query).skip(self.args.skip)
        limit = self.args.limit
        if limit:
            cursor.limit(limit)
        return cursor
Example #42
0
    def crawl_vs(self):
        """
        Crawl view spots with the LY API by iterating over the cities.

        Registers one task per entry of ``self.crawl_city()`` through
        ``self.add_task``. A task queries the scenery list for its location,
        walks all result pages and upserts ``lyId``/``lyName`` of every
        scenery into ``ViewSpot``.

        :return: None
        """
        viewspots = get_mongodb('raw_ly', 'ViewSpot', 'mongo-raw')
        cities = self.crawl_city()
        self.logger.info('-=-=-=-=length: %s' % len(cities))

        for current in cities:

            # The city is bound as a default argument so that every task
            # keeps its own city.
            def func(city=current):
                self.logger.info('================%s==============' % city)

                # The request field is selected by the range of location_id.
                location_id = int(city['location_id'])
                if location_id <= 35:
                    id_field = 'provinceId'
                elif location_id <= 404:
                    id_field = 'cityId'
                else:
                    id_field = 'countryId'
                query_obj = {'clientIp': '127.0.0.1', id_field: location_id}

                first_page = etree.fromstring(
                    self.scenerylist().send_request(query_obj))
                status = first_page.xpath('//rspCode/text()')[0]
                if str(status) != '0000':
                    return

                total_page = first_page.xpath(
                    '//sceneryList')[0].attrib['totalPage']
                for page_no in xrange(1, int(total_page) + 1):
                    page_query = copy.deepcopy(query_obj)
                    page_query['page'] = page_no
                    page_xml = self.scenerylist().send_request(page_query)
                    name_nodes = etree.fromstring(page_xml).xpath('//sceneryName')
                    for name_node in name_nodes:
                        name = name_node.text.encode('utf-8')
                        ly_id = int(name_node.xpath('../sceneryId')[0].text)
                        self.logger.info('----%s-----%s' % (name, ly_id))
                        viewspots.update(
                            {'lyId': ly_id},
                            {'$set': {
                                'lyId': ly_id,
                                'lyName': name
                            }},
                            upsert=True)

            self.add_task(func)
Example #43
0
 def start_requests(self):
     """
     Yield one scenery price request per ``ViewSpot`` with ``crawl == False``.

     The request body is assembled by ``self.assemble_req_xml`` for the
     ``sceneryprice`` API; the response is handled by
     ``self.parse_ticket_types`` and carries the ``lyId`` in its meta.
     """
     viewspots = get_mongodb('raw_ly', 'ViewSpot', profile='mongo-raw')
     for spot in viewspots.find({'crawl': False}, {'lyId': 1, 'lyName': 1}):
         ly_id = int(spot['lyId'])
         params = {'clientIp': '127.0.0.1', 'sceneryIds': ly_id, 'payType': 0}
         xml_body = self.assemble_req_xml(self.api['sceneryprice'], params)
         http_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                         'X-Requested-With': 'XMLHttpRequest'}
         yield Request(url=self.url, method='POST', body=xml_body, headers=http_headers,
                       callback=self.parse_ticket_types, meta={'lyId': ly_id})
Example #44
0
    def process_item(self, item, spider):
        """
        Upsert a koubei POI into the collection that matches its type.

        Items this pipeline does not handle (according to ``self.is_handler``)
        are passed through. ``item['type']`` selects the collection of the
        ``raw_koubei`` database; ``item['data']`` is written with ``$set``
        (upsert), matched on ``id``.

        :return: the item that was passed in
        """
        if not self.is_handler(item, spider):
            return item

        payload = item['data']
        poi_type = item['type']

        collection_by_type = {
            'locality': 'Locality',
            'attraction': 'Viewspot',
            'restaurant': 'Dining',
            'shopping': 'Shopping',
            'activity': 'Activity',
            'hotel': 'Hotel',
        }
        target = get_mongodb('raw_koubei', collection_by_type[poi_type], 'mongo-raw')
        target.update({'id': payload['id']}, {'$set': payload}, upsert=True)

        return item
def zhName_to_file(filename):
    """
    Append the ``zhName`` of every ``ViewSpot`` document to a file, one name
    per line.

    Documents without a ``zhName`` key are skipped.

    :param filename: path of the file to append to
    """
    viewspot_conn = get_mongodb("poi", "ViewSpot", "mongo")
    cursor = viewspot_conn.find()

    # The context manager closes the file even when reading the cursor or
    # writing fails half-way.
    with open(filename, 'a') as f:
        for i, val in enumerate(cursor, 1):
            if 'zhName' in val:
                # NOTE(review): str() on a non-ASCII unicode value raises
                # under Python 2's default ASCII codec -- confirm how these
                # names are expected to be encoded.
                f.write(str(val['zhName']) + '\n')
            # Flush after every 1000 documents.
            if i % 1000 == 0:
                f.flush()
Example #46
0
 def process_item(self, item, spider):
     """
     Save the weather fields of the item as a new ``CityTemprature`` document.

     Only the fields present on the item are stored.

     :return: the item that was passed in
     """
     stored_fields = ('loc', 'current_temprature', 'forecast', 'current', 'source')
     data = dict((name, item[name]) for name in stored_fields if name in item)
     weather = get_mongodb('yahooweather', 'CityTemprature', profile=None)
     weather.save(data)
     return item
Example #47
0
    def start_requests(self):
        """
        Yield one Yahoo weather request per country code.

        The codes come from ``self.param['country']`` when that is set and
        non-empty, otherwise from the ``code`` field of every ``Country``
        document. Each request is handled by ``self.parse_state_url`` and
        carries ``{'countrycode', 'abroad'}`` under ``meta['data']``, where
        ``abroad`` is false only for the code ``cn`` (case-insensitive).
        """
        codes = []
        if 'param' in dir(self):
            options = getattr(self, 'param', [])
            if 'country' in options:
                codes = options['country']

        if not codes:
            countries = get_mongodb('geo', 'Country')
            codes = [doc['code'] for doc in countries.find({}, {'code': 1})]

        base_url = 'https://weather.yahoo.com/'
        for code in codes:
            payload = {'countrycode': code, 'abroad': code.lower() != 'cn'}
            yield Request(url=base_url + code, callback=self.parse_state_url,
                          meta={'data': payload})
Example #48
0
    def process_item(self, item, spider):
        """
        Create or update the ``YahooCityInfo`` document for the item.

        The existing document is looked up by ``woeid`` (level 2) when the
        item has one, otherwise by ``country`` and ``state`` (level 1). The
        fields present on the item overwrite the stored ones.

        :return: the item that was passed in
        """
        cities = get_mongodb('raw_data', 'YahooCityInfo', profile='mongodb-crawler')

        # Retrieve the locality if it exists.
        if 'woeid' in item:
            lookup = {'woeid': item['woeid'], 'level': 2}
        else:
            lookup = {'country': item['country'], 'state': item['state'], 'level': 1}
        doc = cities.find_one(lookup) or {}

        for field in ('country', 'state', 'city', 'abroad', 'coords', 'woeid', 'level'):
            if field in item:
                doc[field] = item[field]

        cities.save(doc)

        return item
Example #49
0
    def process_item(self, item, spider):
        """
        Create or update the ``TravelGisCity`` document for a city item.

        Items whose type name is not ``CityItem`` are passed through. The
        document is looked up by city name and country code.

        :return: the item that was passed in
        """
        if type(item).__name__ != CityItem.__name__:
            return item

        cities = get_mongodb('raw_data', 'TravelGisCity', 'localhost', 27027)

        doc = cities.find_one({'name': item['city'], 'countryCode': item['code']}) or {}
        doc.update({
            'name': item['city'],
            'country': item['country'],
            'countryCode': item['code'],
            'lat': item['lat'],
            'lng': item['lng'],
        })
        cities.save(doc)

        return item
Example #50
0
    def process_dining_item(item, spider):
        """
        Upsert a Dianping dining shop into ``Dining``, matched on ``shop_id``.

        ``item['data']`` is written with ``$set``. Whatever
        ``DianpingPipeline.add_to_set`` returns for ``'tags'`` is, when
        non-empty, added with ``$addToSet``/``$each``. An
        ``OperationFailure`` is logged through the spider, not raised.

        :return: the item that was passed in
        """
        payload = item['data']

        each_ops = {}
        for field in ('tags',):
            values = DianpingPipeline.add_to_set(payload, field)
            if values:
                each_ops[field] = {'$each': values}

        shops = get_mongodb('raw_dianping', 'Dining', 'mongo-raw')
        operations = {'$set': payload}
        if each_ops:
            operations['$addToSet'] = each_ops

        from pymongo.errors import OperationFailure

        try:
            shops.update({'shop_id': payload['shop_id']}, operations, upsert=True)
        except OperationFailure as err:
            spider.log(err.message, level=log.ERROR)
        return item
Example #51
0
    def parse(self, response):
        """
        Yield a Google geocode request for every stored Qyer POI of the
        countries in ``response.meta['countries']``.

        POIs with a falsy latitude or longitude are skipped. Each request
        carries a ``QyerPoiItem`` holding the POI's known fields in
        ``meta['item']`` and is handled by ``self.parse_geocode``.
        """
        meta = response.meta
        spots = get_mongodb('raw_data', 'QyerSpot', profile='mongodb-crawler')
        geocode_tmpl = 'http://maps.googleapis.com/maps/api/geocode/json?address=%f,%f'

        for country in meta['countries']:
            # Look up the POIs of the given country.
            for spot in spots.find({'country_info.country_engname': country}):
                lat = spot['poi_lat']
                lng = spot['poi_lng']
                if not (lat and lng):
                    continue

                item = QyerPoiItem()
                for field in spot:
                    if field in item.fields:
                        item[field] = spot[field]

                # This step is needed to find the city the POI belongs to.
                yield Request(url=geocode_tmpl % (lat, lng), meta={'item': item},
                              callback=self.parse_geocode, dont_filter=True)
Example #52
0
    def process_item(self, item, spider):
        """
        Create or update the ``GeoNamesCity`` document for a city item.

        Items whose type name is not ``CityItem`` are passed through. The
        document uses ``item['city_id']`` as its ``_id``.

        :return: the item that was passed in
        """
        if type(item).__name__ != CityItem.__name__:
            return item

        cities = get_mongodb('raw_data', 'GeoNamesCity', 'localhost', 27027)

        doc = cities.find_one({'_id': item['city_id']}) or {}
        doc.update({
            'enName': item['en_name'],
            'zhName': item['zh_name'],
            'countryCode': item['country_code'],
            'lat': item['lat'],
            'lng': item['lng'],
            'population': item['population'],
            '_id': item['city_id'],
        })
        cities.save(doc)

        return item
Example #53
0
    def update_city(self, item):
        """
        Replace ``item['poi_city']`` (a list of candidate city names) with the
        matching ``Locality`` summary.

        For each candidate, in order, a locality of the item's country is
        searched whose ``alias`` starts with the lower-cased candidate and
        which lies within 100 km of the POI coordinates; the first hit wins.

        :return: the updated item, or ``None`` (after logging a warning) when
            no candidate matches
        """
        candidates = item['poi_city']
        country = item['country_info']

        # Look up the city.
        localities = get_mongodb('geo', 'Locality', profile='mongodb-general')
        city = None
        for candidate in candidates:
            country_id = country['_id']
            # NOTE(review): the candidate is inserted into the pattern
            # without escaping -- confirm that names never contain regex
            # metacharacters.
            alias_pattern = re.compile(r'^%s' % candidate.lower())
            poi_point = {'type': 'Point',
                         'coordinates': [item['poi_lng'], item['poi_lat']]}
            criteria = {
                'country.id': country_id,
                'alias': alias_pattern,
                'location': {
                    '$near': {
                        '$geometry': poi_point,
                        '$minDistance': 0,
                        '$maxDistance': 100 * 1000
                    }
                }
            }
            projection = {'zhName': 1, 'enName': 1, 'coords': 1}
            matches = list(localities.find(criteria, projection).limit(5))
            if matches:
                city = matches[0]
                break

        if not city:
            self.log('Failed to find locality from DB: %s' % ', '.join(candidates), log.WARNING)
            return

        # The distinct non-empty names serve as fallback for a missing one.
        stripped = [city[k].strip() for k in ['zhName', 'enName'] if k in city and city[k]]
        alias_names = list(set(name for name in stripped if name))

        def name_or_fallback(field):
            # Use the stored name; fall back to the first known name when it
            # is missing or not a string.
            try:
                return city[field].strip()
            except (ValueError, KeyError, AttributeError):
                return alias_names[0]

        zh_name = name_or_fallback('zhName')
        en_name = name_or_fallback('enName')
        item['poi_city'] = {'id': city['_id'], '_id': city['_id'], 'zhName': zh_name, 'enName': en_name}
        return item
Example #54
0
    def process_item(self, item, spider):
        """
        Create or update the ``ChanyoujiNote1`` document for a travel note.

        Items that are not ``ChanyoujiYoujiItem`` instances are passed
        through. The document is looked up by ``noteId``
        (``item['trips_id']``) and the note fields overwrite the stored ones.

        :return: the item that was passed in
        """
        if not isinstance(item, ChanyoujiYoujiItem):
            return item

        notes = get_mongodb('raw_data', 'ChanyoujiNote1', profile='mongodb-crawler')

        # (document field, item field) pairs.
        field_map = (
            ('noteId', 'trips_id'),
            ('title', 'title'),
            ('authorName', 'authorName'),
            ('favorCnt', 'favorCnt'),
            ('commentCnt', 'commentCnt'),
            ('viewCnt', 'viewCnt'),
            ('note', 'data'),
            ('authorAvatar', 'authorAvatar'),
            ('authorId', 'authorId'),
        )
        note = dict((target, item[source]) for target, source in field_map)

        doc = notes.find_one({'noteId': note['noteId']}) or {}
        doc.update(note)
        notes.save(doc)

        return item
Example #55
0
    def process_item(self, item, spider):
        """
        Create or update the ``BusStation`` document for a station item.

        Items whose type name is not ``BusStatoinItem`` are passed through.
        The document is looked up by ``stationId``. ``addr`` and ``tel`` are
        stored only when present and truthy; ``blat``/``blng`` only when both
        are.

        :return: the item that was passed in
        """
        if type(item).__name__ != BusStatoinItem.__name__:
            return item

        stations = get_mongodb('raw_data', 'BusStation', 'localhost', 27027)

        doc = stations.find_one({'stationId': item['station_id']}) or {}
        doc.update({
            'stationId': item['station_id'],
            'city': item['city'],
            'province': item['prov'],
            'name': item['name'],
        })

        for optional in ('addr', 'tel'):
            if optional in item and item[optional]:
                doc[optional] = item[optional]

        # The coordinates are only stored as a pair.
        if all(axis in item and item[axis] for axis in ('blat', 'blng')):
            doc['blat'] = item['blat']
            doc['blng'] = item['blng']

        stations.save(doc)

        return item
Example #56
0
 def fetch_db_col(db, col, profile):
     """
     Return what ``get_mongodb`` yields for the given database name,
     collection name and profile.
     """
     collection = get_mongodb(db, col, profile)
     return collection
Example #57
0
 def fetch_db_col(self, db, col, profile):
     """
     Return the ``get_mongodb`` result for the given arguments, cached in
     ``self.col_dict`` under the key ``'<db>.<col>.<profile>'``.
     """
     cache = self.col_dict
     sig = '%s.%s.%s' % (db, col, profile)
     if sig in cache:
         return cache[sig]
     cache[sig] = get_mongodb(db, col, profile)
     return cache[sig]
                        "type": "geo_point"
                    },
                    "type": {
                        "type": "string"
                    }
                }
            }

        }
    }
}
# Register the mapping for the document type on the index.
es_client.indices.put_mapping(type_name, viewspot_map, index_name)

# Entries collected for the bulk request.
bulk_data = []

viewspot_conn = get_mongodb("poi", "ViewSpot", 'mongo')
cursor = viewspot_conn.find()

# Running counter; it is used as the "_id" of each indexed document.
i = 0
for val in cursor:
    i = i + 1
    # Metadata entry for this document.
    bulk_data.append({
        "index": {
            "_index": index_name,
            "_type": type_name,
            "_id": i
        }
    })

    doc = {}
Example #59
0
    def process_item(self, item, spider):
        city_info = item['poi_city']
        country_info = item['country_info']

        # lookup the poi
        col_vs = get_mongodb('poi', 'ViewSpot', profile='mongodb-general')
        vs = col_vs.find_one({'source.qyer.id': item['poi_id']})
        if not vs:
            vs = {}

        source = vs['source'] if 'source' in vs else {}
        source['qyer'] = {'id': item['poi_id'], 'url': item['poi_url']}
        vs['source'] = source

        desc = vs['description'] if 'description' in vs else {}
        desc['desc'] = item['poi_summary']
        vs['description'] = desc

        vs['name'] = item['poi_name']
        vs['zhName'] = item['poi_name']
        vs['enName'] = item['poi_englishName']

        def _image_proc(url):
            m = re.search(r'^(.+pic\.qyer\.com/album/.+/index)/[0-9x]+$', url)
            return m.group(1) if m else url

        vs['imageList'] = map(_image_proc, item['poi_photo'] if 'poi_photo' in item and item['poi_photo'] else [])

        vs['country'] = country_info
        vs['city'] = city_info

        alias = filter(lambda val: val,
                       list(set([vs[k].strip().lower() if vs[k] else '' for k in ['name', 'zhName', 'enName']])))
        alias.extend(item['alias'])
        vs['alias'] = list(set(alias))
        vs['rating'] = item['rating'] if 'rating' in item else None

        vs['targets'] = [city_info['_id'], country_info['_id']]
        vs['enabled'] = True
        vs['abroad'] = True

        vs['location'] = {'type': 'Point', 'coordinates': [item['poi_lng'], item['poi_lat']]}
        if 'viewport' in item:
            vs['viewport'] = {'northeast': {'type': 'Point',
                                            'coordinates': [item['viewport']['northeast']['lng'],
                                                            item['viewport']['northeast']['lat']]},
                              'southwest': {'type': 'Point',
                                            'coordinates': [item['viewport']['southwest']['lng'],
                                                            item['viewport']['southwest']['lat']]},
            }

        details = item['poi_detail'] if 'poi_detail' in item else []
        new_det = []
        for entry in details:
            if entry['title'][:2] == u'门票':
                vs['priceDesc'] = entry['content']
            elif entry['title'][:4] == u'到达方式':
                vs['trafficInfo'] = entry['content']
            elif entry['title'][:4] == u'开放时间':
                vs['openTime'] = entry['content']
            elif entry['title'][:2] == u'地址':
                vs['address'] = entry['content']
            elif entry['title'][:2] == u'网址':
                vs['website'] = entry['content']
            elif entry['title'][:4] == u'所属分类':
                tags = set(vs['tags'] if 'tags' in vs else [])
                for t in re.split(ur'[/\||\s,]', entry['content']):
                    # for t in re.split(r'[/\|\s,]', entry['content']):
                    t = t.strip()
                    if t:
                        tags.add(t)
                vs['tags'] = list(tags)
            else:
                new_det.append(entry['title'] + entry['content'])