class FundMapping(object):
    """Link funds in the ``base_fund`` collection to their main fund.

    A code-to-code mapping is scraped from the CSRC fund classification page
    and then used to fill the ``main`` field of the stored documents.
    """

    def __init__(self):
        self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund')
        self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization'

    def get_fund_mapping(self):
        """Download the classification page and build the code mapping.

        Returns:
            dict: for every element with CSS class ``aa`` or ``dd``, the text
            of its second ``td`` cell mapped to the text of its fourth ``td``
            cell. Elements with fewer than four cells are skipped.
        """
        # NOTE(review): the original comment listed the columns as
        # "sub_code, sub_name, main_code, main_name", while cells 1 and 3 are
        # the ones used as codes by update_to_mongo. That suggests a leading
        # column (e.g. a row number) -- confirm against the live page.
        html = requests.get(self._url, timeout=30.0).content
        # bytes.decode is equivalent to unicode(html, 'utf-8') on Python 2 and
        # also works on Python 3.
        document = PyQuery(html.decode('utf-8'))

        sub_to_main_mapping = {}
        for selector in ('.aa', '.dd'):
            for row in document.items(selector):
                cells = [cell.text() for cell in row.items('td')]
                if len(cells) < 4:
                    # Not a data row; indexing it would raise IndexError.
                    continue
                sub_to_main_mapping[cells[1]] = cells[3]
        return sub_to_main_mapping

    def update_to_mongo(self):
        """Store the main fund's ``sid`` in the ``main`` field of each fund.

        The first six characters of every document's ``code`` are looked up
        in the scraped mapping. Documents without a usable main code are left
        untouched. Otherwise a document whose ``code`` contains the main code
        is fetched and its ``sid`` is written to ``main``; an empty string is
        written when no such document (or no ``sid``) is found.
        """
        fund_mapping = self.get_fund_mapping()

        # presumably kwargs={'code': 1} selects the ``code`` field -- confirm
        # against Mongodb.query.
        for item in self._collection.query(kwargs={'code': 1}).sort([('_id', 1)]):
            main_fund_code = fund_mapping.get(item['code'][:6])
            if not main_fund_code:
                # Missing, or an empty table cell. An empty code would compile
                # to a pattern that matches every document and attach an
                # arbitrary fund as the main fund.
                continue

            # Escape so the code is matched literally. The search is still
            # unanchored, as before.
            # NOTE(review): consider anchoring with '^' if codes always start
            # with the six-character fund code.
            regex = re.compile(re.escape(main_fund_code))
            main_fund_sid = self._collection.get({'code': regex}, {'sid': 1})
            print('main: {0}'.format(main_fund_sid))

            _main = (main_fund_sid or {}).get('sid', '')
            self._collection.update({'_id': item['_id']}, setdata={'main': _main})
def remove_by_url(files_path='D:/temp/data/20151119/'):
    """Flag 163.com news records whose URL is listed in local files.

    Translated original note: remove the text records on the 208 server that
    affect the ``sum`` field, using the URLs in the files produced by the
    ``handle_163_text`` function (not visible here).

    The first line of every file in ``files_path`` is read as a URL. When that
    URL belongs to a ``topnews_analyse`` document whose ``url`` matches
    ``163\\.com``, the document is updated with ``stat = 0``; nothing is
    physically deleted. A progress line is printed for every updated record.

    Args:
        files_path: Directory containing the URL files. Defaults to the
            directory that was previously hard-coded.
    """
    count = 0
    coll_208 = Mongodb('192.168.250.208', 27017, 'news', 'topnews_analyse')
    try:
        # url -> _id for every stored 163.com record.
        url_to_id = {}
        for _docs in coll_208.query({'url': {'$regex': re.compile(r'163\.com')}}, {'url': 1}):
            url_to_id[_docs['url']] = _docs['_id']

        for _c, filename in enumerate(os.listdir(files_path), 1):
            with open(os.path.join(files_path, filename)) as fp:
                url = fp.readline().strip()

            if url not in url_to_id:
                continue

            count += 1
            object_id = url_to_id[url]
            coll_208.update({'_id': object_id}, setdata={'stat': 0})
            print('Order: {}, count: {}, id: {}'.format(_c, count, object_id))
            # Pause after each write, as the original did.
            time.sleep(0.4)
    finally:
        # Release the connection even when a file or database call fails.
        coll_208.disconnect()
def remove_otc(excel_path='D:/test/need_delete.xlsx', start=0):
    """Flag the OTC announcements listed in an Excel workbook.

    Each row read from the workbook supplies an ``_id``. The matching
    ``announcement_otc`` document is updated with ``stat = 0`` and ``upt`` set
    to the current local time; nothing is physically deleted. The ``_id`` and
    the 1-based row position are printed for every processed row.

    Args:
        excel_path: Workbook to read. Defaults to the path that was
            previously hard-coded.
        start: Rows whose 1-based position is below this value are skipped.
            The default of 0 processes every row, matching the original
            ``ind >= 0`` guard.
    """
    workbook = XlsxReader(excel_path)
    coll = Mongodb('122.144.134.95', 27017, 'news', 'announcement_otc')
    try:
        # presumably _id=str makes the reader return the _id column as text
        # -- confirm against XlsxReader.collection.
        for ind, doc in enumerate(workbook.collection(_id=str), 1):
            if ind < start:
                continue
            print('{0} {1}'.format(doc['_id'], ind))
            coll.update(
                {'_id': ObjectId(doc['_id'])},
                setdata={'stat': 0, 'upt': datetime.now()}
            )
    finally:
        # Release the connection even when a row fails to update.
        coll.disconnect()