Example #1
0
    def get_keys(__line):
        """Yield each distinct country key derived from one record.

        Keys come from three places, all passed through ``key_modify`` and
        de-duplicated before anything is yielded:

        * the ``ADDITIONAL_COUNTRY_LIST`` entry for ``__line['id']``, if any;
        * every field named in ``COUNTRY_KEYS`` whose value ``is_legal``
          accepts;
        * every field named in ``COUNTRY_MULTI_KEYS``: a truthy value is
          stripped, split on ``MULTI_SPLIT_KEY``, and each piece accepted
          by ``is_legal`` is kept.

        The keys are held in a set, so the order they are yielded in is
        not defined.
        """
        found = set()

        # Extra alias registered for this record id, when one exists.
        extra = ADDITIONAL_COUNTRY_LIST.get(__line['id'])
        if extra is not None:
            found.add(key_modify(extra))

        # Single-valued fields.
        found.update(
            key_modify(__line[field])
            for field in COUNTRY_KEYS
            if is_legal(__line[field])
        )

        # Fields that pack several names into one delimited string.
        for field in COUNTRY_MULTI_KEYS:
            if not __line[field]:
                continue
            pieces = __line[field].strip().split(MULTI_SPLIT_KEY)
            found.update(
                key_modify(piece) for piece in pieces if is_legal(piece)
            )

        for country_key in found:
            yield country_key
    def get_keys(self, _line):
        """Generate the lookup keys for one city record.

        Country names, region names (only when ``NEED_REGION`` is truthy)
        and city names are read from ``_line``, filtered with ``is_legal``
        and normalised with ``key_modify``.  A printable summary of the
        collected names is stored in ``self.city_info_dict`` under
        ``_line['id']`` before anything is yielded.

        What is yielded depends on ``KEY_CONTENT``:

        * ``'both'`` -- if ``NEED_REGION`` is truthy, every
          country/region/city combination first; then every country/city
          combination.  A key is a tuple when ``KEY_TYPE`` is ``'tuple'``
          and the parts joined with ``STR_KEY_SPLIT_WORD`` when it is
          ``'str'``.  While the country/city keys are produced, the pair
          is flagged in ``self.can_use_region`` if ``NEED_REGION`` is
          truthy and at least one region name was collected.
        * ``'city'`` -- the bare city names.

        Names are held in sets, so the iteration order is not defined.

        Raises:
            TypeError: ``KEY_CONTENT`` is neither ``'both'`` nor ``'city'``,
                or a composite key has to be built while ``KEY_TYPE`` is
                neither ``'tuple'`` nor ``'str'``.
        """
        countries = set()
        cities = set()
        regions = set()

        def harvest(bucket, plain_fields, multi_fields):
            # Single-valued columns first, then the delimiter-separated ones.
            for field in plain_fields:
                if is_legal(_line[field]):
                    bucket.add(key_modify(_line[field]))
            for field in multi_fields:
                if not _line[field]:
                    continue
                for piece in _line[field].strip().split(MULTI_SPLIT_KEY):
                    if is_legal(piece):
                        bucket.add(key_modify(piece))

        def build_key(parts):
            # Shape a composite key according to the configured KEY_TYPE.
            if KEY_TYPE == 'tuple':
                return tuple(parts)
            if KEY_TYPE == 'str':
                return STR_KEY_SPLIT_WORD.join(parts)
            raise TypeError('未知分割类型,当前支持 str, tuple')

        # Extra country alias registered for this record's country id.
        extra = ADDITIONAL_COUNTRY_LIST.get(_line['country_id'])
        if extra is not None:
            countries.add(key_modify(extra))

        harvest(countries, COUNTRY_KEYS, COUNTRY_MULTI_KEYS)
        if NEED_REGION:
            harvest(regions, REGION_KEY, ())
        harvest(cities, CITY_KEYS, CITY_MULTI_KEYS)

        # Keep a printable summary of this record so it can be looked up
        # later by its id.
        summary = 'CityId ({3}) Country ({0}) Region ({1}) City ({2})'.format(
            ', '.join(countries), ', '.join(regions), ', '.join(cities),
            _line['id'])
        self.city_info_dict[_line['id']] = summary

        if KEY_CONTENT == 'both':
            # Most specific keys first: country + region + city.
            if NEED_REGION:
                for country in countries:
                    for region in regions:
                        for city in cities:
                            yield build_key([country, region, city])

            # Then the country + city keys.
            for country in countries:
                for city in cities:
                    # Record that this pair also has region-level keys.
                    if NEED_REGION and regions:
                        self.can_use_region[(country, city)] = True
                    yield build_key([country, city])
            return

        if KEY_CONTENT != 'city':
            raise TypeError('未知 key 内容设置')

        for city in cities:
            yield city
Example #3
0

if __name__ == '__main__':
    data = []

    count = 0
    collections.remove()
    for line in mongo_find_iter(src_collections):
        count += 1
        if count % 1000 == 0:
            print('Now', count)
        source_id = line['parent_info']['id']
        pdf_url_list = line['pdf_url']
        img_url_list = line['img_url']
        for img_url in img_url_list:
            if is_legal(img_url):
                if 'logo' not in img_url.lower():
                    modified_url = modify_url(img_url)
                    if modified_url:
                        args = {
                            'mid': source_id,
                            'type': 'img',
                            'source_url': modified_url
                        }

                        data.append({
                            'args': args,
                            'task_token': get_token(args),
                            'used_times': 0,
                            'finished': 0,
                            'utime': datetime.datetime.now()