def get_keys(__line):
    """Yield each distinct country key derived from one record.

    Keys are gathered into a set first, so duplicates are emitted only once
    and the output order is the set's iteration order.

    Sources, in the order they are consulted:

    1. ``ADDITIONAL_COUNTRY_LIST``, looked up by ``__line['id']``; the entry
       is used when it exists and is not ``None``.
    2. Every field named in ``COUNTRY_KEYS`` whose value passes ``is_legal``.
    3. Every field named in ``COUNTRY_MULTI_KEYS`` whose value is truthy; the
       value is stripped, split on ``MULTI_SPLIT_KEY``, and each piece that
       passes ``is_legal`` is kept.

    Every accepted value is passed through ``key_modify`` before it is stored.

    :param __line: record indexed with ``[]`` by ``'id'`` and by every field
        listed in ``COUNTRY_KEYS`` and ``COUNTRY_MULTI_KEYS``.
    :return: generator over the collected keys.
    """
    found = set()

    # Optional extra country key registered for this record id.
    extra = ADDITIONAL_COUNTRY_LIST.get(__line['id'], None)
    if extra is not None:
        found.add(key_modify(extra))

    # Fields that carry a single candidate value.
    found.update(
        key_modify(__line[field])
        for field in COUNTRY_KEYS
        if is_legal(__line[field])
    )

    # Fields that pack several candidates separated by MULTI_SPLIT_KEY.
    for field in COUNTRY_MULTI_KEYS:
        packed = __line[field]
        if not packed:
            continue
        for piece in packed.strip().split(MULTI_SPLIT_KEY):
            if is_legal(piece):
                found.add(key_modify(piece))

    yield from found
def get_keys(self, _line):
    """Yield every lookup key that can be built for one city record.

    Country, region and city candidates are collected into three sets. A
    value is kept when it passes ``is_legal`` and is stored after being
    passed through ``key_modify``. Region candidates are only collected
    when ``NEED_REGION`` is truthy.

    What is yielded depends on module-level settings:

    * ``KEY_CONTENT == 'both'``: when ``NEED_REGION`` is truthy, first every
      (country, region, city) combination, then every (country, city)
      combination. Each key is a tuple when ``KEY_TYPE == 'tuple'`` or the
      parts joined with ``STR_KEY_SPLIT_WORD`` when ``KEY_TYPE == 'str'``.
    * ``KEY_CONTENT == 'city'``: every city key on its own.

    Side effects (performed while the generator is being consumed):

    * ``self.city_info_dict[_line['id']]`` receives a text summary of the
      collected keys.
    * ``self.can_use_region[(country, city)]`` is set to ``True`` for each
      (country, city) pair when ``NEED_REGION`` is truthy and at least one
      region key was collected.

    :param _line: record indexed with ``[]`` by ``'id'``, ``'country_id'``
        and every field listed in the ``*_KEYS`` settings that are used.
    :return: generator of keys.
    :raises TypeError: if ``KEY_CONTENT`` is neither ``'both'`` nor
        ``'city'``, or if a combined key is about to be produced while
        ``KEY_TYPE`` is neither ``'tuple'`` nor ``'str'``.
    """

    def add_single(target, fields):
        # Each field holds at most one candidate value.
        for field in fields:
            value = _line[field]
            if is_legal(value):
                target.add(key_modify(value))

    def add_multi(target, fields):
        # Each field may pack several candidates separated by MULTI_SPLIT_KEY.
        for field in fields:
            packed = _line[field]
            if not packed:
                continue
            for piece in packed.strip().split(MULTI_SPLIT_KEY):
                if is_legal(piece):
                    target.add(key_modify(piece))

    def build_key(*parts):
        # Shape one combined key according to KEY_TYPE.
        if KEY_TYPE == 'tuple':
            return parts
        if KEY_TYPE == 'str':
            return STR_KEY_SPLIT_WORD.join(parts)
        raise TypeError('未知分割类型,当前支持 str, tuple')

    countries, cities, regions = set(), set(), set()

    # Optional extra country key registered for this record's country id.
    extra_country = ADDITIONAL_COUNTRY_LIST.get(_line['country_id'], None)
    if extra_country is not None:
        countries.add(key_modify(extra_country))
    add_single(countries, COUNTRY_KEYS)
    add_multi(countries, COUNTRY_MULTI_KEYS)

    if NEED_REGION:
        add_single(regions, REGION_KEY)

    add_single(cities, CITY_KEYS)
    add_multi(cities, CITY_MULTI_KEYS)

    # Save city_info so it can be looked up later.
    summary = 'CityId ({3}) Country ({0}) Region ({1}) City ({2})'.format(
        ', '.join(countries),
        ', '.join(regions),
        ', '.join(cities),
        _line['id'],
    )
    self.city_info_dict[_line['id']] = summary

    if KEY_CONTENT == 'city':
        yield from cities
        return
    if KEY_CONTENT != 'both':
        raise TypeError('未知 key 内容设置')

    # Three-part keys come first, and only when regions are in use.
    if NEED_REGION:
        for country in countries:
            for region in regions:
                for city in cities:
                    yield build_key(country, region, city)

    # Two-part keys are always produced.
    for country in countries:
        for city in cities:
            if NEED_REGION and regions:
                # Remember that a region-qualified key exists for this pair.
                self.can_use_region[(country, city)] = True
            yield build_key(country, city)
if __name__ == '__main__': data = [] count = 0 collections.remove() for line in mongo_find_iter(src_collections): count += 1 if count % 1000 == 0: print('Now', count) source_id = line['parent_info']['id'] pdf_url_list = line['pdf_url'] img_url_list = line['img_url'] for img_url in img_url_list: if is_legal(img_url): if 'logo' not in img_url.lower(): modified_url = modify_url(img_url) if modified_url: args = { 'mid': source_id, 'type': 'img', 'source_url': modified_url } data.append({ 'args': args, 'task_token': get_token(args), 'used_times': 0, 'finished': 0, 'utime': datetime.datetime.now()