class FundMapping(object):
    """Link sub-fund documents in MongoDB to their main fund.

    The sub-fund -> main-fund relation is scraped from the CSRC fund
    classification page, then written to the ``main`` field of documents
    in the ``base_fund`` collection.
    """

    def __init__(self):
        self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund')
        self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization'

    def get_fund_mapping(self):
        """Scrape the classification page and return ``{sub: main}``.

        Every element matching ``.aa`` or ``.dd`` is treated as one table
        row; the text of its second ``<td>`` becomes the key and the text
        of its fourth ``<td>`` the value. When a key occurs more than once
        the last occurrence wins.

        NOTE(review): the original comment listed the columns as
        "sub_code, sub_name, main_code, main_name", yet cells 1 and 3 are
        the ones used as codes -- presumably cell 0 is a leading column;
        confirm against the live page.

        Raises:
            requests.HTTPError: if the page answers with an error status.
        """
        response = requests.get(self._url, timeout=30.0)
        # Fail loudly instead of parsing an error page into an empty mapping.
        response.raise_for_status()
        document = PyQuery(response.content.decode('utf-8'))

        sub_to_main = {}
        for selector in ('.aa', '.dd'):
            for row in document.items(selector):
                cells = [cell.text() for cell in row.items('td')]
                # Header/spacer rows may not carry all four cells.
                if len(cells) < 4:
                    continue
                sub_to_main[cells[1]] = cells[3]
        return sub_to_main

    def update_to_mongo(self):
        """Write the main fund's ``sid`` into the ``main`` field of sub-funds.

        For every document returned by the collection query (ordered by
        ``_id``), the first six characters of ``code`` are looked up in the
        scraped mapping. When a main-fund code is found, the first document
        whose ``code`` contains it is fetched and its ``sid`` (or ``''``
        when no such document or field exists) is stored as ``main``.
        """
        fund_mapping = self.get_fund_mapping()

        cursor = self._collection.query(kwargs={'code': 1}).sort([('_id', 1)])
        for item in cursor:
            main_fund_code = fund_mapping.get(item['code'][:6])
            # An empty code would compile to a pattern that matches every
            # document and assign an arbitrary sid, so skip it as well.
            if not main_fund_code:
                continue

            # The code is scraped text: escape it so that it is matched
            # literally instead of being interpreted as a regex.
            pattern = re.compile(re.escape(main_fund_code))
            main_fund_doc = self._collection.get({'code': pattern}, {'sid': 1})
            print('main: {0}'.format(main_fund_doc))

            main_sid = (main_fund_doc or {}).get('sid', '')
            self._collection.update({'_id': item['_id']}, setdata={'main': main_sid})
# Example #2
# 0
def remove_by_url(files_path='D:/temp/data/20151119/'):
    """Set ``stat`` to 0 on 163.com news records whose URL is listed in files.

    Translated from the original note: removes the text records on the
    "208" server that affect the ``sum`` field, using the URLs found in the
    files produced by the ``handle_163_text`` function.

    Each file in ``files_path`` is expected to carry a URL on its first
    line. Matching records are not physically deleted; their ``stat``
    field is set to 0.

    Args:
        files_path: Directory holding the URL files. Defaults to the
            location that was previously hard-coded.
    """
    coll_208 = Mongodb('192.168.250.208', 27017, 'news', 'topnews_analyse')
    try:
        # Map every stored 163.com URL to its document id for O(1) lookups.
        url_to_id = {}
        url_filter = {'url': {'$regex': re.compile(r'163\.com')}}
        for doc in coll_208.query(url_filter, {'url': 1}):
            url_to_id[doc['url']] = doc['_id']

        count = 0
        for order, filename in enumerate(os.listdir(files_path), 1):
            with open(os.path.join(files_path, filename)) as fp:
                url = fp.readline().strip()

            if url not in url_to_id:
                continue

            count += 1
            object_id = url_to_id[url]
            coll_208.update({'_id': object_id}, setdata={'stat': 0})
            print('Order: {}, count: {}, id: {}'.format(
                order, count, object_id))
            # Pause between writes; presumably throttles load on the server.
            time.sleep(0.4)
    finally:
        # Release the connection even when a file or an update fails.
        coll_208.disconnect()
# Example #3
# 0
def remove_otc(excel_path='D:/test/need_delete.xlsx', start=1):
    """Set ``stat`` to 0 on OTC announcements listed in an Excel sheet.

    Every row read from the workbook supplies an ``_id``; the matching
    document in ``announcement_otc`` gets ``stat`` set to 0 and ``upt`` set
    to the current local time.

    Args:
        excel_path: Workbook listing the ids to deactivate. Defaults to the
            location that was previously hard-coded.
        start: 1-based row number to begin at; earlier rows are skipped.
            NOTE(review): the original guarded the update with
            ``if ind >= 0``, which is always true because counting starts
            at 1 -- presumably a hand-edited resume threshold. The default
            keeps the "process every row" behaviour.
    """
    workbook = XlsxReader(excel_path)
    coll = Mongodb('122.144.134.95', 27017, 'news', 'announcement_otc')
    try:
        for ind, doc in enumerate(workbook.collection(_id=str), 1):
            if ind < start:
                continue

            print('{0} {1}'.format(doc['_id'], ind))
            coll.update(
                {'_id': ObjectId(doc['_id'])},
                setdata={'stat': 0, 'upt': datetime.now()}
            )
    finally:
        # Release the connection even when an update fails midway.
        coll.disconnect()