def insert(self, total_data):
    """Store executive shareholding-change rows in ``self._coll_in``.

    Each element of ``total_data`` is a sequence of strings in this
    column order:

        0  company code
        1  company short name
        2  director / supervisor / senior-executive name
        3  position
        4  stock type
        5  currency type
        6  shares held before this change
        7  number of shares changed
        8  average price of this change
        9  shares held after the change
        10 reason for the change
        11 change date
        12 filing (report) date

    A row whose uuid is already present in the collection is not inserted;
    a message is printed instead. The collection is disconnected when the
    method finishes, including when a lookup or insert raises.

    :param total_data: iterable of rows laid out as described above.
    """
    try:
        for item in total_data:
            currency = data_by_table_type(
                {'zhsname': item[5]}, [('code',)], 'curr')
            secu_code, orgid = data_by_table_type(
                {'tick': item[0]}, [('code',), ('org', 'id')], 'stock')
            name_en, pid = data_by_table_type(
                {'name.szh': item[2], 'orgid': orgid},
                [('name', 'en'), ('pid',)], 'exec')

            # Thousands separators are stripped only for the ratio
            # computation; the stored 'change' value keeps the raw text.
            to_ratio, cir_ratio = ratio(
                item[7].replace(',', ''), secu_code, 'stock', 'vary')

            # The uuid is derived from every column of the row, so an
            # identical row always maps to the same identifier.
            uid_uuid = str(uuid.uuid5(
                uuid.NAMESPACE_DNS, ''.join(item).encode('u8')))

            data = {
                # Fall back to the raw company code when the stock lookup
                # yields a falsy code.
                'secu': secu_code or item[0],
                'org': orgid,
                'scp': {'szh': '', 'en': ''},
                'uuid': uid_uuid,
                'name': {'szh': item[2], 'en': name_en},
                'relation': '',
                'pid': pid,
                'change': item[7],
                'after': item[9],
                'cur': currency,
                'cause': item[10],
                'cd': item[11],
                'rd': item[12],
                'stat': 1,
                'price': item[8],
                'upt': datetime.now(),
                'upu': 'system',
                'torat': to_ratio,
                'cirrat': cir_ratio,
                'typ': 'sha',
            }

            if self._coll_in.get({'uuid': uid_uuid}, {'secu': 1}):
                # Single-argument form prints the same text under the
                # Python 2 print statement and the print function.
                print('uuid existed: {0}'.format(uid_uuid))
            else:
                self._coll_in.insert(data)
    finally:
        # Release the connection even if a lookup or insert failed.
        self._coll_in.disconnect()
def main(self, multi_pool_page=None):
    """Crawl listing pages and store the parsed rows in ``self._coll_in``.

    With ``multi_pool_page`` left as ``None`` pages ``1`` through
    ``self.crawl_pages`` are processed; with an int only that single page
    is processed. For every row returned by ``self.parse_data`` a document
    is built and inserted when either the row's change date is greater
    than ``self._latest_cd_data`` or no document with the same uuid is
    stored yet. The collection is disconnected when the method finishes,
    including when fetching, a lookup or an insert raises.

    :param multi_pool_page: ``None`` for the full page range, or the
        single page number to process.
    :raises AssertionError: if ``multi_pool_page`` is neither ``None``
        nor an int.
    """
    # Kept as an assert so callers keep seeing AssertionError; note that
    # the check disappears when Python runs with -O.
    assert multi_pool_page is None or isinstance(multi_pool_page, int), \
        '`multi_pool_page` must None or int.'

    if multi_pool_page is None:
        start_page, end_page = 1, self.crawl_pages
    else:
        start_page = end_page = multi_pool_page

    try:
        for page in range(start_page, end_page + 1):
            url = self._base_url + self._query_string.format(page)
            secus, changes, afters, prices, scps, ncdrs = self.parse_data(url)

            # The parallel lists are indexed (not zipped) on purpose: a
            # length mismatch still raises IndexError instead of silently
            # dropping rows.
            for i, tick in enumerate(secus):
                # ncdrs[i] holds, by position: executive name, change
                # date, reason for the change, relation.
                ncdr = ncdrs[i]
                exec_name = ncdr[0]
                change_date = ncdr[1]
                cause = ncdr[2]
                relation = ncdr[3]

                # True when this row is newer than the latest stored
                # change date; such rows skip the uuid existence check.
                cp_flag = self._latest_cd_data < change_date

                # secu_code / orgid are looked up by ticker ('stock'),
                # name_en / pid by executive name and orgid ('exec').
                secu_code, orgid = data_by_table_type(
                    {'tick': tick}, [('code',), ('org', 'id')], 'stock')
                name_en, pid = data_by_table_type(
                    {'name.szh': exec_name, 'orgid': orgid},
                    [('name', 'en'), ('pid',)], 'exec')
                to_ratio, cir_ratio = ratio(
                    changes[i], secu_code, 'stock', 'vary')

                scp_clean = scps[i].replace('\\', '')

                # The uuid is the unique identifier of a row; it is
                # derived from all of the row's fields.
                name = ''.join([
                    tick, exec_name, change_date, changes[i], prices[i],
                    cause, afters[i], scp_clean, relation,
                ])
                uid_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))

                data = {
                    # Fall back to the raw ticker when the stock lookup
                    # yields a falsy code.
                    'secu': secu_code or tick,
                    'org': orgid,
                    'change': changes[i],
                    'after': afters[i],
                    'price': prices[i],
                    'cd': change_date,
                    'cause': cause,
                    'relation': relation,
                    'stat': 1,
                    'name': {'szh': exec_name, 'en': name_en},
                    'scp': {
                        'szh': MarkReplace(secu_code, scp_clean).replace_mark(),
                        # NOTE(review): this lookup uses the raw scps[i]
                        # (backslashes not stripped), unlike 'szh' above;
                        # confirm that is intended.
                        'en': data_by_table_type(
                            {'name.szh': scps[i]}, [('name', 'en')], 'exec'),
                    },
                    'cur': 'CNY',
                    'rd': '',
                    'pid': pid,
                    'uuid': uid_uuid,
                    'typ': 'szx',
                    'upu': 'system',
                    'upt': datetime.now(),
                    'torat': to_ratio,
                    'cirrat': cir_ratio,
                }

                # Equivalent to the former if/elif pair: newer rows are
                # always inserted, older rows only when their uuid is
                # not stored yet.
                if cp_flag or not self._coll_in.get({'uuid': uid_uuid}, {'secu': 1}):
                    self._coll_in.insert(data)

            print('page: [{0}] done!'.format(page))
    finally:
        # Release the connection even if a page failed part-way through.
        self._coll_in.disconnect()
def main(self, multi_pool_page=None):
    """Crawl listing pages and store the parsed rows in ``self._coll_in``.

    With ``multi_pool_page`` left as ``None`` pages ``1`` through
    ``self.crawl_pages`` are processed; with an int only that single page
    is processed. For every row returned by ``self.parse_data`` a document
    is built and inserted when either the row's change date is greater
    than ``self._latest_cd_data`` or no document with the same uuid is
    stored yet. The collection is disconnected when the method finishes,
    including when fetching, a lookup or an insert raises.

    NOTE(review): an earlier ``main`` with the same logic appears in this
    file; if both are in the same class this later definition is the one
    that takes effect -- confirm and remove the duplicate.

    :param multi_pool_page: ``None`` for the full page range, or the
        single page number to process.
    :raises AssertionError: if ``multi_pool_page`` is neither ``None``
        nor an int.
    """
    # Kept as an assert so callers keep seeing AssertionError; note that
    # the check disappears when Python runs with -O.
    assert multi_pool_page is None or isinstance(multi_pool_page, int), \
        '`multi_pool_page` must None or int.'

    if multi_pool_page is None:
        start_page, end_page = 1, self.crawl_pages
    else:
        start_page = end_page = multi_pool_page

    try:
        for page in range(start_page, end_page + 1):
            url = self._base_url + self._query_string.format(page)
            secus, changes, afters, prices, scps, ncdrs = self.parse_data(url)

            # The parallel lists are indexed (not zipped) on purpose: a
            # length mismatch still raises IndexError instead of silently
            # dropping rows.
            for i, tick in enumerate(secus):
                # ncdrs[i] holds, by position: executive name, change
                # date, reason for the change, relation.
                ncdr = ncdrs[i]
                exec_name = ncdr[0]
                change_date = ncdr[1]
                cause = ncdr[2]
                relation = ncdr[3]

                # True when this row is newer than the latest stored
                # change date; such rows skip the uuid existence check.
                cp_flag = self._latest_cd_data < change_date

                # secu_code / orgid are looked up by ticker ('stock'),
                # name_en / pid by executive name and orgid ('exec').
                secu_code, orgid = data_by_table_type(
                    {'tick': tick}, [('code',), ('org', 'id')], 'stock')
                name_en, pid = data_by_table_type(
                    {'name.szh': exec_name, 'orgid': orgid},
                    [('name', 'en'), ('pid',)], 'exec')
                to_ratio, cir_ratio = ratio(
                    changes[i], secu_code, 'stock', 'vary')

                scp_clean = scps[i].replace('\\', '')

                # The uuid is the unique identifier of a row; it is
                # derived from all of the row's fields.
                name = ''.join([
                    tick, exec_name, change_date, changes[i], prices[i],
                    cause, afters[i], scp_clean, relation,
                ])
                uid_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))

                data = {
                    # Fall back to the raw ticker when the stock lookup
                    # yields a falsy code.
                    'secu': secu_code or tick,
                    'org': orgid,
                    'change': changes[i],
                    'after': afters[i],
                    'price': prices[i],
                    'cd': change_date,
                    'cause': cause,
                    'relation': relation,
                    'stat': 1,
                    'name': {'szh': exec_name, 'en': name_en},
                    'scp': {
                        'szh': MarkReplace(secu_code, scp_clean).replace_mark(),
                        # NOTE(review): this lookup uses the raw scps[i]
                        # (backslashes not stripped), unlike 'szh' above;
                        # confirm that is intended.
                        'en': data_by_table_type(
                            {'name.szh': scps[i]}, [('name', 'en')], 'exec'),
                    },
                    'cur': 'CNY',
                    'rd': '',
                    'pid': pid,
                    'uuid': uid_uuid,
                    'typ': 'szx',
                    'upu': 'system',
                    'upt': datetime.now(),
                    'torat': to_ratio,
                    'cirrat': cir_ratio,
                }

                # Equivalent to the former if/elif pair: newer rows are
                # always inserted, older rows only when their uuid is
                # not stored yet.
                if cp_flag or not self._coll_in.get({'uuid': uid_uuid}, {'secu': 1}):
                    self._coll_in.insert(data)

            print('page: [{0}] done!'.format(page))
    finally:
        # Release the connection even if a page failed part-way through.
        self._coll_in.disconnect()