def get_annual_share_hold_info(page_list):
    """Extract shareholder contribution rows from annual-report HTML pages.

    Each page is parsed as HTML and every ``<tr>`` inside a ``<table>`` is
    visited, except rows carrying the ``partner_com_top`` class.  Rows with
    fewer than two ``<td>`` cells are skipped.

    :param page_list: iterable of HTML page sources.
    :return: list of dicts keyed by ``AnnualReports.ShareholderInformation``
        constants, one dict per accepted row.
    """
    records = []
    for html in page_list:
        rows = py(html, parser='html').find('table').find('tr').not_(
            '.partner_com_top').items()
        for row in rows:
            cells = row.find('td')
            if len(cells) < 2:
                continue
            info = AnnualReports.ShareholderInformation
            # Cells 1..7 carry the data; cell 0 is never read.
            (name, sub_amount, sub_time, sub_type,
             paid_amount, paid_time, paid_type) = [
                cells.eq(col).text().strip() for col in range(1, 8)]
            records.append({
                info.SHAREHOLDER_NAME: name,
                info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
                info.SUBSCRIPTION_TIME: sub_time,  # subscription date
                info.SUBSCRIPTION_TYPE: sub_type,  # subscription form
                info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
                info.PAIED_TIME: paid_time,  # paid-in date
                info.PAIED_TYPE: paid_type,  # paid-in form
            })
    return records
def get_shareholder_info(self, shareholder_info):
    """Parse shareholder rows from the ``.table-result`` table of each page.

    A row is kept only when it has more than two ``<td>`` cells and exactly
    two nested ``<table>`` elements; the first nested table is handed to
    ``get_sharehold_info_sub_detail`` as the subscription part and the second
    one with ``is_subs_or_paied='paied'``.

    :param shareholder_info: crawl payload passed to ``self.get_crawl_page``.
    :return: ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` when at least one
        row was parsed, otherwise an empty dict.
    """
    parsed_rows = []
    for page in self.get_crawl_page(shareholder_info, True):
        result_table = PyQuery(page.get('text', u''), parser='html').find(
            '.table-result')
        rows = result_table.find('tr').items()
        amount_unit = util.get_amount_unit(result_table)
        for row in rows:
            cells = row.find('td')
            if cells is None or len(cells) <= 2:
                continue
            info = GsModel.ShareholderInformation
            share_model = {
                info.SHAREHOLDER_NAME: cells.eq(0).text().replace(u'\\t', u''),
                info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(
                    cells.eq(1).text()),
                info.PAIED_AMOUNT: util.get_amount_with_unit(
                    cells.eq(2).text()),
            }
            nested_tables = cells.find('table')
            if nested_tables is None or len(nested_tables) != 2:
                # Rows without both detail tables are dropped entirely.
                continue
            share_model.update(self.get_sharehold_info_sub_detail(
                nested_tables.eq(0).find('tr').items(),
                amount_unit=amount_unit))
            share_model.update(self.get_sharehold_info_sub_detail(
                nested_tables.eq(1).find('tr').items(),
                amount_unit=amount_unit,
                is_subs_or_paied='paied'))
            parsed_rows.append(share_model)

    if not parsed_rows:
        return {}
    return {GsModel.SHAREHOLDER_INFORMATION: parsed_rows}
def get_annual_share_hold_info(gdcz_item):
    """Convert annual-report shareholder rows into field dictionaries.

    :param gdcz_item: iterable of row elements exposing ``find('td')``.
    :return: list of dicts keyed by ``AnnualReports.ShareholderInformation``
        constants; rows whose combined cell text is empty are skipped.
    """
    records = []
    for row in gdcz_item:
        cells = row.find('td')
        if cells.text() == '':
            continue
        info = AnnualReports.ShareholderInformation
        # Cells 1..7 carry the data; cell 0 is never read.
        (name, sub_amount, sub_time, sub_type,
         paid_amount, paid_time, paid_type) = [
            cells.eq(col).text() for col in range(1, 8)]
        records.append({
            info.SHAREHOLDER_NAME: name,
            info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
            info.SUBSCRIPTION_TIME: sub_time,
            info.SUBSCRIPTION_TYPE: sub_type,
            info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
            info.PAIED_TIME: paid_time,
            info.PAIED_TYPE: paid_type,
        })
    return records
def get_share_hold_detail(self, pages):
    """Build a contributor model from JSON detail pages.

    Each page's ``text`` is decoded as a JSON list.  An item holding a
    ``form`` key supplies the shareholder name and total amounts; other items
    are treated as subscription or paid-in detail lists.

    :param pages: iterable of page dicts with a ``text`` entry, or ``None``.
    :return: ``(shareholder_name, sub_model)``; ``("", {})`` when ``pages``
        is ``None``.
    """
    shareholder_name = ""
    sub_model = {}
    if pages is None:
        return shareholder_name, sub_model

    contributor = GsModel.ContributorInformation
    for page in pages:
        page_text = page.get('text')
        for item in json.loads(page_text):
            if 'form' in item:
                form = item.get('form')
                shareholder_name = form.get('inv')
                sub_model[contributor.SHAREHOLDER_NAME] = form.get('inv')
                sub_model[contributor.SUBSCRIPTION_AMOUNT] = \
                    util.get_amount_with_unit(form.get('lisubconam'))
                sub_model[contributor.PAIED_AMOUNT] = \
                    util.get_amount_with_unit(form.get('liacconam'))
            # NOTE(review): the two branches below test the raw page text,
            # not the current item -- confirm this is intended.
            elif 'subconam' in page_text:
                sub_keys = contributor.SubscriptionDetail
                sub_details = []
                for entry in item.get('list'):
                    sub_details.append(self.replace_none({
                        sub_keys.SUBSCRIPTION_TYPE: entry.get('conform_cn'),
                        sub_keys.SUBSCRIPTION_AMOUNT:
                            util.get_amount_with_unit(entry.get('subconam')),
                        sub_keys.SUBSCRIPTION_TIME: entry.get('condate'),
                    }))
                sub_model[contributor.SUBSCRIPTION_DETAIL] = sub_details
            elif 'acconam' in page_text:
                paid_keys = contributor.PaiedDetail
                paid_details = []
                for entry in item.get('list'):
                    paid_details.append(self.replace_none({
                        paid_keys.PAIED_TYPE: entry.get('conform_cn'),
                        paid_keys.PAIED_AMOUNT:
                            util.get_amount_with_unit(entry.get('acconam')),
                        paid_keys.PAIED_TIME: entry.get('condate'),
                    }))
                sub_model[contributor.PAIED_DETAIL] = paid_details

    sub_model = self.replace_none(sub_model)
    return shareholder_name, sub_model
def get_gdcz_info(self, table):
    """Parse shareholder contribution rows from ``.tablebodytext`` elements.

    :param table: PyQuery-like element containing the rows.
    :return: ``{AnnualReports.SHAREHOLDER_INFORMATION: [...]}``; the list is
        empty when no row has at least seven ``<td>`` cells.
    """
    records = []
    for row in table.find('.tablebodytext').items():
        cells = row.find('td')
        if cells is None or len(cells) < 7:
            continue
        info = AnnualReports.ShareholderInformation
        # Cells 1..7 carry the data; cell 0 is never read.
        (name, sub_amount, sub_time, sub_type,
         paid_amount, paid_time, paid_type) = [
            cells.eq(col).text() for col in range(1, 8)]
        records.append({
            info.SHAREHOLDER_NAME: name,
            info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
            info.SUBSCRIPTION_TIME: sub_time,
            info.SUBSCRIPTION_TYPE: sub_type,
            info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
            info.PAIED_TIME: paid_time,
            info.PAIED_TYPE: paid_type,
        })
    return {AnnualReports.SHAREHOLDER_INFORMATION: records}
def get_annual_share_hold_info(self, json_list):
    """Map JSON shareholder records onto annual-report field dictionaries.

    Every record is first passed through ``replace_none``.  The contribution
    form codes (``subconform`` / ``acconform``) have their commas removed and
    are translated by ``self.switch_type``; an empty code stays ``''``.

    :param json_list: iterable of decoded JSON records.
    :return: list of dicts keyed by ``AnnualReports.ShareholderInformation``
        constants, each passed through ``replace_none``.
    """
    records = []
    for raw_record in json_list:
        record = replace_none(raw_record)

        sub_code = record.get('subconform', '')
        sub_form = '' if sub_code == '' else self.switch_type(
            sub_code.replace(',', ''))

        paid_code = record.get('acconform', '')
        paid_form = '' if paid_code == '' else self.switch_type(
            paid_code.replace(',', ''))

        info = AnnualReports.ShareholderInformation
        records.append(replace_none({
            info.SHAREHOLDER_NAME: record.get(u'inv'),
            info.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(record.get(u'lisubconam')),
            info.SUBSCRIPTION_TIME: record.get(u'subcondate'),  # subscription date
            info.SUBSCRIPTION_TYPE: sub_form,  # subscription form (translated code)
            info.PAIED_AMOUNT:
                util.get_amount_with_unit(record.get(u'liacconam')),
            info.PAIED_TIME: record.get(u'accondate'),  # paid-in date
            info.PAIED_TYPE: paid_form,  # paid-in form (translated code)
        }))
    return records
def _get_annual_sharehold_info(self, json_str): lst = [] if json_str is None or json_str == '': return lst js_obj = util.json_loads(json_str) for js_item in js_obj: share_model = { AnnualReports.ShareholderInformation.SHAREHOLDER_NAME: js_item.get(u'inv', u''), AnnualReports.ShareholderInformation.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(js_item.get(u'subConAm', u'')), AnnualReports.ShareholderInformation.SUBSCRIPTION_TIME: str(js_item.get(u'conDate', 0)), AnnualReports.ShareholderInformation.SUBSCRIPTION_TYPE: js_item.get(u'conFormName', u''), AnnualReports.ShareholderInformation.PAIED_AMOUNT: util.get_amount_with_unit(js_item.get(u'acConAm', u'')), AnnualReports.ShareholderInformation.PAIED_TIME: str(js_item.get(u'realConDate', 0)), AnnualReports.ShareholderInformation.PAIED_TYPE: js_item.get(u'realConFormName', u''), } lst.append(share_model) return lst
def get_gdcz_info(self, page_text):
    """Parse the shareholder-contribution section of an annual-report page.

    The page is scanned for ``.item_box`` sections; the first one whose
    ``.item_title`` text contains ``'股东及出资'`` is parsed and scanning
    stops there.  Rows with fewer than eight ``<td>`` cells are skipped.

    :param page_text: HTML source of the page.
    :return: ``{AnnualReports.SHAREHOLDER_INFORMATION: [...]}`` (the list may
        be empty).
    """
    records = []
    sections = PyQuery(page_text, parser='html').find('.item_box').items()
    for section in sections:
        if section.find('.item_title').text().find('股东及出资') == -1:
            continue
        for row in section.find('tr').items():
            cells = row.find('td')
            if cells is None or len(cells) < 8:
                continue
            info = AnnualReports.ShareholderInformation
            (name, sub_amount, sub_time, sub_type,
             paid_amount, paid_time, paid_type) = [
                cells.eq(col).text() for col in range(1, 8)]
            records.append({
                info.SHAREHOLDER_NAME: name,
                info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
                info.SUBSCRIPTION_TIME: sub_time,
                info.SUBSCRIPTION_TYPE: sub_type,
                info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
                info.PAIED_TIME: paid_time,
                info.PAIED_TYPE: paid_type,
            })
        # Only the first matching section is processed.
        break
    return {AnnualReports.SHAREHOLDER_INFORMATION: records}
def get_annual_share_hold_info(table):
    """Extract shareholder contribution rows from a single table element.

    :param table: PyQuery-like table element.
    :return: list of dicts keyed by ``AnnualReports.ShareholderInformation``
        constants; rows with fewer than two ``<td>`` cells are skipped and
        all cell texts are stripped.
    """
    records = []
    for row in table.find('tr').items():
        cells = row.find('td')
        if len(cells) < 2:
            continue
        info = AnnualReports.ShareholderInformation
        # Cells 1..7 carry the data; cell 0 is never read.
        (name, sub_amount, sub_time, sub_type,
         paid_amount, paid_time, paid_type) = [
            cells.eq(col).text().strip() for col in range(1, 8)]
        records.append({
            info.SHAREHOLDER_NAME: name,
            info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
            info.SUBSCRIPTION_TIME: sub_time,  # subscription date
            info.SUBSCRIPTION_TYPE: sub_type,  # subscription form
            info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
            info.PAIED_TIME: paid_time,  # paid-in date
            info.PAIED_TYPE: paid_type,  # paid-in form
        })
    return records
def get_con_detail(page):
    """Parse a contributor detail page made of ``.partner_com`` tables.

    Each table is classified by the text of its ``.info_table_h3`` heading:
    headings mentioning ``发起人``/``股东`` give the shareholder summary,
    ``认缴`` the subscription detail rows and ``实缴`` the paid-in detail
    rows.  The three checks are independent, so one table may match several.

    :param page: HTML source; ``None`` or ``u''`` yields ``("", {})``.
    :return: ``(shareholder_name, sub_model)``.
    """
    shareholder_name = ""
    sub_model = {}
    if page is None or page == u'':
        return shareholder_name, sub_model

    contributor = GsModel.ContributorInformation
    for table in py(page, parser='html').find('.partner_com').items():
        heading = table.find('.info_table_h3').text()

        if u'发起人' in heading or u'股东' in heading:
            # Shareholder summary: values sit in cells 1, 3 and 5.
            cells = table.find('td')
            shareholder_name = cells.eq(1).text().strip()
            sub_model[contributor.SHAREHOLDER_NAME] = cells.eq(1).text()
            sub_model[contributor.SUBSCRIPTION_AMOUNT] = \
                util.get_amount_with_unit(cells.eq(3).text())
            sub_model[contributor.PAIED_AMOUNT] = \
                util.get_amount_with_unit(cells.eq(5).text())

        if u'认缴' in heading:
            # Subscription details: every row after the first one.
            sub_keys = contributor.SubscriptionDetail
            rows = table.find('tr')
            sub_details = []
            for row_index in xrange(1, len(rows)):
                cells = rows.eq(row_index).find('td')
                sub_details.append(replace_none({
                    sub_keys.SUBSCRIPTION_TYPE: cells.eq(0).text(),
                    sub_keys.SUBSCRIPTION_AMOUNT:
                        util.get_amount_with_unit(cells.eq(1).text()),
                    sub_keys.SUBSCRIPTION_TIME: cells.eq(2).text(),
                }))
            sub_model[contributor.SUBSCRIPTION_DETAIL] = sub_details

        if u'实缴' in heading:
            # Paid-in details: every row after the first one.
            paid_keys = contributor.PaiedDetail
            rows = table.find('tr')
            paid_details = []
            for row_index in xrange(1, len(rows)):
                cells = rows.eq(row_index).find('td')
                paid_details.append(replace_none({
                    paid_keys.PAIED_TYPE: cells.eq(0).text(),
                    paid_keys.PAIED_AMOUNT:
                        util.get_amount_with_unit(cells.eq(1).text()),
                    paid_keys.PAIED_TIME: cells.eq(2).text(),
                }))
            sub_model[contributor.PAIED_DETAIL] = paid_details

    sub_model = replace_none(sub_model)
    return shareholder_name, sub_model
def zj_get_share_hold_detail(tables):
    """Parse contributor detail tables that each hold a single record.

    A table is classified by its full text.  The summary table must have
    exactly six ``<td>`` cells, the subscription table six and the paid-in
    table three; otherwise the corresponding values are left out (the detail
    lists are still stored, possibly empty).

    :param tables: iterable of PyQuery-like tables, or ``None``.
    :return: ``(shareholder_name, sub_model)``; ``("", {})`` for ``None``.
    """
    shareholder_name = ""
    sub_model = {}
    if tables is None:
        return shareholder_name, sub_model

    contributor = GsModel.ContributorInformation
    summary_markers = (u'发起人', u'股东名称', u'股东及出资人名称', u'股东')
    for table in tables:
        table_text = table.text()

        if any(marker in table_text for marker in summary_markers):
            cells = table.find('td')
            if len(cells) == 6:
                sub_model[contributor.SHAREHOLDER_NAME] = \
                    cells.eq(1).text().strip().replace(u'.', u'')
                sub_model[contributor.SUBSCRIPTION_AMOUNT] = \
                    util.get_amount_with_unit(cells.eq(3).text())
                sub_model[contributor.PAIED_AMOUNT] = \
                    util.get_amount_with_unit(cells.eq(5).text())
                shareholder_name = sub_model[contributor.SHAREHOLDER_NAME]

        if u'认缴出资方式' in table_text:
            sub_keys = contributor.SubscriptionDetail
            sub_details = []
            cells = table.find('td')
            if len(cells) == 6:
                # Amount is in cell 1, form in cell 3, date in cell 5.
                sub_details.append({
                    sub_keys.SUBSCRIPTION_TYPE: cells.eq(3).text(),
                    sub_keys.SUBSCRIPTION_AMOUNT:
                        util.get_amount_with_unit(cells.eq(1).text()),
                    sub_keys.SUBSCRIPTION_TIME: cells.eq(5).text(),
                })
            sub_model[contributor.SUBSCRIPTION_DETAIL] = sub_details

        if u'实缴出资方式' in table_text:
            paid_keys = contributor.PaiedDetail
            paid_details = []
            cells = table.find('td')
            if len(cells) == 3:
                paid_details.append({
                    paid_keys.PAIED_TYPE: cells.eq(0).text(),
                    paid_keys.PAIED_AMOUNT:
                        util.get_amount_with_unit(cells.eq(1).text()),
                    paid_keys.PAIED_TIME: cells.eq(2).text(),
                })
            sub_model[contributor.PAIED_DETAIL] = paid_details

    return shareholder_name, sub_model
def get_shareholder_info(self, shareholder_info):
    '''
    :param shareholder_info: page-store dictionary holding two lists, ``list`` and
    ``detail``, whose entries are crawled page data. At least one of the two lists
    must exist, otherwise the packet is considered invalid. ``list`` normally stores
    the paginated listing pages and ``detail`` stores the per-item detail pages.
    For the exact structure see the MongoDB page store or the Model comments in
    common/global_field.py.
    Shareholder information is normally stored in ``list``; it has no nested
    listing, so the ``detail`` list is not needed.
    :return: dictionary following the business-registration (GS) schema
    '''
    parsed_rows = []
    for page in self.get_crawl_page(shareholder_info, True):
        details_table = PyQuery(page.get('text', u''), parser='html').find(
            '.detailsList')
        rows = details_table.find('tr').items()
        amount_unit = util.get_amount_unit(details_table)
        for row in rows:
            cells = row.find('td')
            # A data row needs at least twelve cells (indices 0..11).
            if cells is None or len(cells) <= 11:
                continue
            info = GsModel.ShareholderInformation
            sub_keys = info.SubscriptionDetail
            paid_keys = info.PaiedDetail

            subscription = {
                sub_keys.SUBSCRIPTION_TYPE: cells.eq(4).text(),
                sub_keys.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(
                    cells.eq(5).text(), amount_unit),
                sub_keys.SUBSCRIPTION_TIME: cells.eq(6).text(),
                sub_keys.SUBSCRIPTION_PUBLISH_TIME: cells.eq(7).text(),
            }
            paid_in = {
                paid_keys.PAIED_TYPE: cells.eq(8).text(),
                paid_keys.PAIED_AMOUNT: util.get_amount_with_unit(
                    cells.eq(9).text(), amount_unit),
                paid_keys.PAIED_TIME: cells.eq(10).text(),
                paid_keys.PAIED_PUBLISH_TIME: cells.eq(11).text(),
            }
            parsed_rows.append({
                info.SHAREHOLDER_NAME: cells.eq(1).text().replace(u'\\t', u''),
                info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(
                    cells.eq(2).text(), amount_unit),
                info.PAIED_AMOUNT: util.get_amount_with_unit(
                    cells.eq(3).text(), amount_unit),
                info.SUBSCRIPTION_DETAIL: [subscription],
                info.PAIED_DETAIL: [paid_in],
            })

    if not parsed_rows:
        return {}
    return {GsModel.SHAREHOLDER_INFORMATION: parsed_rows}
def get_shareholder_info(self, shareholder_info):
    """Parse shareholder records from JSON listing pages.

    Each page's ``text`` is a JSON list of objects whose ``list`` entry holds
    the shareholder records.  Every record contributes its name and total
    amounts plus one subscription detail and one paid-in detail per entry of
    its ``subList``.

    Changes from the previous implementation:

    * all ``subList`` entries are kept; previously each iteration overwrote
      the detail lists so only the last entry survived;
    * a missing or ``null`` ``subList`` / ``list`` and a page without
      ``text`` are treated as empty instead of raising ``TypeError``.

    :param shareholder_info: crawl payload passed to ``self.get_crawl_page``.
    :return: ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` when at least one
        record was parsed, otherwise an empty dict.
    """
    shareholder_info_dict = {}
    shareholder_list = []
    for page in self.get_crawl_page(shareholder_info, True):
        text = page.get('text')
        if not text:
            # Nothing to decode for this page.
            continue
        for page_list in json.loads(text):
            for data in page_list.get('list') or []:
                if data is None or len(data) == 0:
                    continue
                info = GsModel.ShareholderInformation
                share_model = {
                    info.SHAREHOLDER_NAME: data.get('inv'),
                    info.SUBSCRIPTION_AMOUNT:
                        util.get_amount_with_unit(data.get('lisubconam')),
                    info.PAIED_AMOUNT:
                        util.get_amount_with_unit(data.get('liacconam')),
                }

                sub_details = []
                paid_details = []
                for detail in data.get('subList') or []:
                    # "e_" keys hold the subscription side, "p_" the paid-in side.
                    sub_details.append(self.replace_none({
                        info.SUBSCRIPTION_TYPE: detail.get('e_conform_cn'),
                        info.SUBSCRIPTION_TIME: detail.get('e_condate'),
                        info.SUBSCRIPTION_PUBLISH_TIME:
                            detail.get('e_publicdate'),
                    }))
                    paid_details.append(self.replace_none({
                        info.PAIED_TYPE: detail.get('p_conform_cn'),
                        info.PAIED_TIME: detail.get('p_condate'),
                        info.PAIED_PUBLISH_TIME: detail.get('p_publicdate'),
                    }))
                if sub_details:
                    # As before, the detail keys are only set when subList
                    # had at least one entry.
                    share_model[info.SUBSCRIPTION_DETAIL] = sub_details
                    share_model[info.PAIED_DETAIL] = paid_details

                shareholder_list.append(self.replace_none(share_model))

    if shareholder_list:
        shareholder_info_dict[GsModel.SHAREHOLDER_INFORMATION] = shareholder_list
    return shareholder_info_dict
def get_con_detail(self, page):
    """Build a contributor model from a JSON detail page.

    Each decoded item may carry ``tRegTzrxx`` (shareholder summary),
    ``tRegTzrrjxxList`` (subscription entries) and ``tRegTzrsjxxList``
    (paid-in entries).  Only the first entry of each list is used.  A
    missing or ``null`` key is now treated as empty; previously ``len(None)``
    raised ``TypeError``.

    :param page: JSON text handed to ``util.json_loads``.
    :return: ``(shareholder_name, sub_model)``; ``("", {})`` when decoding
        yields ``None``.
    """
    shareholder_name = ""
    sub_model = {}
    json_data_arr = util.json_loads(page)
    if json_data_arr is None:
        return shareholder_name, sub_model

    contributor = GsModel.ContributorInformation
    for json_item in json_data_arr:
        subscribed_list = json_item.get('tRegTzrrjxxList') or []
        paid_list = json_item.get('tRegTzrsjxxList') or []
        summary = json_item.get('tRegTzrxx') or {}

        rj = subscribed_list[0] if len(subscribed_list) != 0 else {}  # subscription
        sj = paid_list[0] if len(paid_list) != 0 else {}  # paid-in

        if len(summary) != 0:
            shareholder_name = summary.get('inv', '')
            sub_model[contributor.SHAREHOLDER_NAME] = summary.get('inv', '')
            sub_model[contributor.SUBSCRIPTION_AMOUNT] = \
                util.get_amount_with_unit(summary.get('lisubconam', ''))
            sub_model[contributor.PAIED_AMOUNT] = \
                util.get_amount_with_unit(summary.get('liacconam', ''))

        sub_keys = contributor.SubscriptionDetail
        sub_model_detail = self.bu_ding({  # patch helper
            sub_keys.SUBSCRIPTION_TYPE: rj.get('conformName', ''),
            sub_keys.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(rj.get('subconam', '')),
            sub_keys.SUBSCRIPTION_TIME: rj.get('condate', ''),
        })
        sub_model[contributor.SUBSCRIPTION_DETAIL] = [sub_model_detail]

        paid_keys = contributor.PaiedDetail
        paid_model_detail = self.bu_ding({  # patch helper
            paid_keys.PAIED_TYPE: sj.get('conformName', ''),
            paid_keys.PAIED_AMOUNT:
                util.get_amount_with_unit(sj.get('acconam', '')),
            paid_keys.PAIED_TIME: sj.get('condate', ''),
        })
        sub_model[contributor.PAIED_DETAIL] = [paid_model_detail]

    sub_model = self.bu_ding(sub_model)  # patch helper
    return shareholder_name, sub_model
def _get_sharehold_detail(self, tables, name_dom="th"): shareholder_name = "" sub_model = {} if tables is None: return shareholder_name, sub_model for table in tables: th_text = table.find(name_dom).text() if u'发起人' in th_text \ or u'股东名称' in th_text \ or u'股东及出资人名称' in th_text \ or u'股东' in th_text: tds = table.find('td') steps = xrange(3) if name_dom == "th" else [i * 2 + 1 for i in xrange(3)] shareholder_name = tds.eq(steps[0]).text().strip().replace(u'.', u'') sub_model[GsModel.ContributorInformation.SHAREHOLDER_NAME] = tds.eq(steps[0]).text() sub_model[GsModel.ContributorInformation.SUBSCRIPTION_AMOUNT] = util.get_amount_with_unit( tds.eq(steps[1]).text()) sub_model[GsModel.ContributorInformation.PAIED_AMOUNT] = util.get_amount_with_unit( tds.eq(steps[2]).text()) if u'认缴出资方式' in th_text: trs = table.find('tr') lst_sub_detail = [] for tr_i in xrange(1, len(trs)): tds = trs.eq(tr_i).find('td') if len(tds) <= 2: continue sub_model_detail = { GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_TYPE: tds.eq(0).text(), GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit( tds.eq(1).text()), GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_TIME: tds.eq(2).text(), } lst_sub_detail.append(sub_model_detail) sub_model[GsModel.ContributorInformation.SUBSCRIPTION_DETAIL] = lst_sub_detail if u'实缴出资方式' in th_text: trs = table.find('tr') lst_paied_detail = [] for tr_i in xrange(1, len(trs)): tds = trs.eq(tr_i).find('td') if len(tds) <= 2: continue paied_model_detail = { GsModel.ContributorInformation.PaiedDetail.PAIED_TYPE: tds.eq(0).text(), GsModel.ContributorInformation.PaiedDetail.PAIED_AMOUNT: util.get_amount_with_unit( tds.eq(1).text()), GsModel.ContributorInformation.PaiedDetail.PAIED_TIME: tds.eq(2).text(), } lst_paied_detail.append(paied_model_detail) sub_model[GsModel.ContributorInformation.PAIED_DETAIL] = lst_paied_detail return shareholder_name, sub_model
def get_inline_shareholder_info(self, page):
    """Parse shareholder rows from the table inside ``#UpdatePanel1``.

    Rows need at least eleven ``<td>`` cells (indices 0..10); shorter rows
    are skipped.  Amounts are converted with the unit detected by
    ``util.get_amount_unit`` on the table.

    :param page: HTML source of the page.
    :return: ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` (the list may be
        empty).
    """
    panel_table = PyQuery(page, parser='html').find('#UpdatePanel1').find(
        'table')
    rows = panel_table.find('tr').items()
    amount_unit = util.get_amount_unit(panel_table)

    info = GsModel.ShareholderInformation
    sub_keys = info.SubscriptionDetail
    paid_keys = info.PaiedDetail
    shareholders = []
    for row in rows:
        cells = row.find('td')
        if cells is None or len(cells) <= 10:
            continue
        subscription = {
            sub_keys.SUBSCRIPTION_TYPE: cells.eq(3).text(),
            sub_keys.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(
                cells.eq(4).text(), amount_unit),
            sub_keys.SUBSCRIPTION_TIME: cells.eq(5).text(),
            sub_keys.SUBSCRIPTION_PUBLISH_TIME: cells.eq(6).text(),
        }
        paid_in = {
            paid_keys.PAIED_TYPE: cells.eq(7).text(),
            paid_keys.PAIED_AMOUNT: util.get_amount_with_unit(
                cells.eq(8).text(), amount_unit),
            paid_keys.PAIED_TIME: cells.eq(9).text(),
            paid_keys.PAIED_PUBLISH_TIME: cells.eq(10).text(),
        }
        shareholders.append({
            info.SHAREHOLDER_NAME: cells.eq(0).text().replace(u'\\t', u''),
            info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(
                cells.eq(1).text(), amount_unit),
            info.PAIED_AMOUNT: util.get_amount_with_unit(
                cells.eq(2).text(), amount_unit),
            info.SUBSCRIPTION_DETAIL: [subscription],
            info.PAIED_DETAIL: [paid_in],
        })
    return {GsModel.SHAREHOLDER_INFORMATION: shareholders}
def get_shareholder_info(self, shareholder_info):
    """Parse shareholder rows from the ``.partner_com`` tables of each page.

    Rows carrying the ``partner_com_top`` class and rows with fewer than two
    ``<td>`` cells are skipped.  Both publish-time fields are read from the
    same cell (index 10).

    :param shareholder_info: crawl payload passed to ``self.get_crawl_page``.
    :return: ``{}`` when no pages are returned, otherwise
        ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` (the list may be empty).
    """
    pages = self.get_crawl_page(shareholder_info, True)
    if pages is None:
        return {}

    info = GsModel.ShareholderInformation
    shareholders = []
    for page in pages:
        rows = py(page.get('text', u''), parser='html').find(
            '.partner_com').find('tr').not_('.partner_com_top').items()
        for row in rows:
            cells = row.find('td')
            if cells is None or len(cells) < 2:
                continue
            subscription = {
                info.SUBSCRIPTION_TYPE: cells.eq(4).text(),  # subscription form
                info.SUBSCRIPTION_TIME: cells.eq(6).text(),  # subscription date
                info.SUBSCRIPTION_PUBLISH_TIME: cells.eq(10).text(),  # publish date
            }
            paid_in = {
                info.PAIED_TYPE: cells.eq(7).text(),  # paid-in form
                info.PAIED_TIME: cells.eq(9).text(),  # paid-in date
                info.PAIED_PUBLISH_TIME: cells.eq(10).text(),  # publish date
            }
            shareholders.append({
                info.SHAREHOLDER_NAME: cells.eq(1).text().replace(u'\\t', u''),
                info.SUBSCRIPTION_AMOUNT:
                    util.get_amount_with_unit(cells.eq(2).text()),
                info.PAIED_AMOUNT:
                    util.get_amount_with_unit(cells.eq(3).text()),
                info.SUBSCRIPTION_DETAIL: [subscription],
                info.PAIED_DETAIL: [paid_in],
            })
    return {GsModel.SHAREHOLDER_INFORMATION: shareholders}
def get_dwdb_info(self, table):
    """Parse external-guarantee rows from ``.tablebodytext`` elements.

    :param table: PyQuery-like element containing the rows.
    :return: ``{AnnualReports.OUT_GUARANTEE_INFO: [...]}``; rows with fewer
        than seven ``<td>`` cells are skipped.
    """
    guarantees = []
    for row in table.find('.tablebodytext').items():
        cells = row.find('td')
        if cells is None or len(cells) < 7:
            continue
        info = AnnualReports.OutGuaranteeInfo
        (creditor, obligor, debt_type, debt_amount,
         performance, period, guarantee_type, purview) = [
            cells.eq(col).text() for col in range(8)]
        guarantees.append({
            info.CREDITOR: creditor,
            info.OBLIGOR: obligor,
            info.DEBT_TYPE: debt_type,
            info.DEBT_AMOUNT: util.get_amount_with_unit(debt_amount),
            info.PERFORMANCE_PERIOD: performance,
            info.GUARANTEE_PERIOD: period,
            info.GUARANTEE_TYPE: guarantee_type,
            info.GUARANTEE_PURVIEW: purview,
        })
    return {AnnualReports.OUT_GUARANTEE_INFO: guarantees}
def _get_annual_out_guarantee_info(self, json_str): lst = [] if json_str is None or json_str == '': return lst js_obj = util.json_loads(json_str) for js_item in js_obj: perfrom = js_item.get(u'pefPerForm', 0) perto = js_item.get(u'pefPerTo', 0) share_model = { AnnualReports.OutGuaranteeInfo.CREDITOR: js_item.get(u'more', u''), AnnualReports.OutGuaranteeInfo.OBLIGOR: js_item.get(u'mortgagor', u''), AnnualReports.OutGuaranteeInfo.DEBT_TYPE: js_item.get(u'priClaSecKindName', u''), AnnualReports.OutGuaranteeInfo.DEBT_AMOUNT: util.get_amount_with_unit(js_item.get(u'priClaSecAm', u'')), AnnualReports.OutGuaranteeInfo.PERFORMANCE_PERIOD: u"{0}-{1}".format(perfrom, perto), AnnualReports.OutGuaranteeInfo.GUARANTEE_PERIOD: js_item.get(u'guaranPeriodName', u''), AnnualReports.OutGuaranteeInfo.GUARANTEE_TYPE: js_item.get(u'gaTypeName', u''), } lst.append(share_model) return lst
def get_dwdb_info(self, page_text):
    """Parse the external-guarantee section of an annual-report page.

    The first ``.item_box`` whose ``.item_title`` text contains both
    ``'担保'`` and ``'对外'`` is parsed and scanning stops there.  Rows with
    fewer than eight ``<td>`` cells are skipped.

    :param page_text: HTML source of the page.
    :return: ``{AnnualReports.OUT_GUARANTEE_INFO: [...]}`` (the list may be
        empty).
    """
    guarantees = []
    sections = PyQuery(page_text, parser='html').find('.item_box').items()
    for section in sections:
        title = section.find('.item_title').text()
        if title.find('担保') == -1 or title.find('对外') == -1:
            continue
        for row in section.find('tr').items():
            cells = row.find('td')
            if cells is None or len(cells) < 8:
                continue
            info = AnnualReports.OutGuaranteeInfo
            # Cells 1..7 carry the data; cell 0 is never read.
            (creditor, obligor, debt_type, debt_amount,
             performance, period, guarantee_type) = [
                cells.eq(col).text() for col in range(1, 8)]
            guarantees.append({
                info.CREDITOR: creditor,
                info.OBLIGOR: obligor,
                info.DEBT_TYPE: debt_type,
                info.DEBT_AMOUNT: util.get_amount_with_unit(debt_amount),
                info.PERFORMANCE_PERIOD: performance,
                info.GUARANTEE_PERIOD: period,
                info.GUARANTEE_TYPE: guarantee_type,
            })
        # Only the first matching section is processed.
        break
    return {AnnualReports.OUT_GUARANTEE_INFO: guarantees}
def get_annual_out_guarantee_info(self, trs):
    """Convert external-guarantee table rows into field dictionaries.

    The performance-period text (cell 5) is passed through
    ``self.trans_for`` before being stored.

    :param trs: iterable of row elements exposing ``find('td')``.
    :return: list of dicts keyed by ``AnnualReports.OutGuaranteeInfo``
        constants; rows with fewer than two ``<td>`` cells are skipped.
    """
    guarantees = []
    for row in trs:
        cells = row.find('td')
        if len(cells) < 2:
            continue
        info = AnnualReports.OutGuaranteeInfo
        performance_period = self.trans_for(cells.eq(5).text().strip())
        guarantees.append({
            info.CREDITOR: cells.eq(1).text().strip(),
            info.OBLIGOR: cells.eq(2).text().strip(),
            info.DEBT_TYPE: cells.eq(3).text().strip(),
            info.DEBT_AMOUNT:
                util.get_amount_with_unit(cells.eq(4).text().strip()),
            info.PERFORMANCE_PERIOD: performance_period,
            info.GUARANTEE_PERIOD: cells.eq(6).text().strip(),  # guarantee period
            info.GUARANTEE_TYPE: cells.eq(7).text().strip(),  # guarantee form
        })
    return guarantees
def get_annual_out_guarantee_info(self, page_list):
    """Extract external-guarantee rows from annual-report HTML pages.

    Every ``<tr>`` of every ``<table>`` is visited, except rows carrying the
    ``partner_com_top`` class.  The performance-period text (cell 5) is
    passed through ``self.trans_for`` before being stored.

    :param page_list: iterable of HTML page sources.
    :return: list of dicts keyed by ``AnnualReports.OutGuaranteeInfo``
        constants; rows with fewer than two ``<td>`` cells are skipped.
    """
    guarantees = []
    for html in page_list:
        rows = py(html, parser='html').find('table').find('tr').not_(
            '.partner_com_top').items()
        for row in rows:
            cells = row.find('td')
            if len(cells) < 2:
                continue
            info = AnnualReports.OutGuaranteeInfo
            performance_period = self.trans_for(cells.eq(5).text().strip())
            guarantees.append({
                info.CREDITOR: cells.eq(1).text().strip(),
                info.OBLIGOR: cells.eq(2).text().strip(),
                info.DEBT_TYPE: cells.eq(3).text().strip(),
                info.DEBT_AMOUNT:
                    util.get_amount_with_unit(cells.eq(4).text().strip()),
                info.PERFORMANCE_PERIOD: performance_period,
                info.GUARANTEE_PERIOD: cells.eq(6).text().strip(),  # guarantee period
                info.GUARANTEE_TYPE: cells.eq(7).text().strip(),  # guarantee form
            })
    return guarantees
def _get_annual_sharehold_info(self, py_items): lst = [] for item in py_items: tds = item.find('td') if len(tds) < 8: continue share_model = { AnnualReports.ShareholderInformation.SHAREHOLDER_NAME: tds.eq(1).text(), AnnualReports.ShareholderInformation.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(tds.eq(2).text()), AnnualReports.ShareholderInformation.SUBSCRIPTION_TIME: tds.eq(3).text(), AnnualReports.ShareholderInformation.SUBSCRIPTION_TYPE: tds.eq(4).text(), AnnualReports.ShareholderInformation.PAIED_AMOUNT: util.get_amount_with_unit(tds.eq(5).text()), AnnualReports.ShareholderInformation.PAIED_TIME: tds.eq(6).text(), AnnualReports.ShareholderInformation.PAIED_TYPE: tds.eq(7).text(), } lst.append(share_model) return lst
def get_shareholder_info(self, page):
    """Parse shareholder rows from the ``#table_qytzr`` table.

    :param page: HTML source.  A ``dict`` or ``None`` is rejected and yields
        ``{}``.
    :return: ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` (the list may be
        empty); rows with fewer than two ``<td>`` cells are skipped.
    """
    if page is None or isinstance(page, dict):
        return {}

    info = GsModel.ShareholderInformation
    shareholders = []
    for row in py(page, parser='html').find('#table_qytzr').find('tr').items():
        cells = row.find('td')
        if cells is None or len(cells) < 2:
            continue
        subscription = {
            info.SUBSCRIPTION_TYPE: cells.eq(3).text(),  # subscription form
            info.SUBSCRIPTION_TIME: cells.eq(5).text(),  # subscription date
            info.SUBSCRIPTION_PUBLISH_TIME: cells.eq(6).text(),  # publish date
        }
        paid_in = {
            info.PAIED_TYPE: cells.eq(7).text(),  # paid-in form
            info.PAIED_TIME: cells.eq(9).text(),  # paid-in date
            info.PAIED_PUBLISH_TIME: cells.eq(10).text(),  # publish date
        }
        shareholders.append({
            info.SHAREHOLDER_NAME: cells.eq(0).text().replace(u'\\t', u''),
            info.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(cells.eq(1).text()),  # subscribed
            info.PAIED_AMOUNT:
                util.get_amount_with_unit(cells.eq(2).text()),  # paid-in
            info.SUBSCRIPTION_DETAIL: [subscription],
            info.PAIED_DETAIL: [paid_in],
        })
    return {GsModel.SHAREHOLDER_INFORMATION: shareholders}
def get_gdcz_info(self, page_text):
    """Parse shareholder contribution rows from every ``<tr>`` of a page.

    :param page_text: HTML source of the page.
    :return: list of dicts keyed by ``AnnualReports.ShareholderInformation``
        constants; rows with fewer than seven ``<td>`` cells are skipped.
    """
    records = []
    for row in PyQuery(page_text, parser='html').find('tr').items():
        cells = row.find('td')
        if cells is None or len(cells) < 7:
            continue
        info = AnnualReports.ShareholderInformation
        # Cells 1..7 carry the data; cell 0 is never read.
        (name, sub_amount, sub_time, sub_type,
         paid_amount, paid_time, paid_type) = [
            cells.eq(col).text() for col in range(1, 8)]
        records.append({
            info.SHAREHOLDER_NAME: name,
            info.SUBSCRIPTION_AMOUNT: util.get_amount_with_unit(sub_amount),
            info.SUBSCRIPTION_TIME: sub_time,
            info.SUBSCRIPTION_TYPE: sub_type,
            info.PAIED_AMOUNT: util.get_amount_with_unit(paid_amount),
            info.PAIED_TIME: paid_time,
            info.PAIED_TYPE: paid_type,
        })
    return records
def get_shareholder_info(self, shareholder_info):
    """Parse shareholder records from JSON listing pages.

    Pages are tried in order; a page whose JSON cannot be decoded or whose
    nested ``list.list`` entry is ``None`` is logged and skipped.  Parsing
    stops after the first page that is processed.

    :param shareholder_info: crawl payload passed to ``self.get_crawl_page``.
    :return: ``{}`` when there are no pages, otherwise
        ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` (the list may be empty).
    """
    pages = self.get_crawl_page(shareholder_info, True)
    if pages is None or len(pages) <= 0:
        return {}

    info = GsModel.ShareholderInformation
    shareholders = []
    for page in pages:
        res_text = page.get(u'text', u'{}')
        json_data = util.json_loads(res_text)
        if json_data is None:
            self.log.error(
                'json转换失败: res_text = {text}'.format(text=res_text))
            continue

        obj_list = json_data.get(u'list', {}).get(u'list', [])
        if obj_list is None:
            self.log.info('没有 股东信息..')
            continue

        for obj in obj_list:
            # The same form list feeds both the subscription and paid-in details.
            form_list = obj.get(u'invFormList', [])
            shareholders.append({
                info.SHAREHOLDER_NAME: str(obj.get(u'invName', u'')),
                info.SUBSCRIPTION_AMOUNT: str(
                    util.get_amount_with_unit(obj.get(u'subConAm', u''))),
                info.PAIED_AMOUNT: str(
                    util.get_amount_with_unit(obj.get(u'acConAm', u''))),
                info.SUBSCRIPTION_DETAIL:
                    self.get_sharehold_info_sub_paid_detail(form_list),
                info.PAIED_DETAIL:
                    self.get_sharehold_info_ac_paid_detail(form_list),
            })
        # The data comes from JSON and every page carries the full data set,
        # so a single page is enough.
        break

    return {GsModel.SHAREHOLDER_INFORMATION: shareholders}
def get_sharehold_info_sub_paid_detail(self, json_obj, amount_unit=u"万元"):
    """Convert a JSON form list into subscription-detail dicts.

    For every entry of ``json_obj`` one dict is produced, keyed by the
    ``GsModel.ShareholderInformation.SubscriptionDetail`` constants:

    * ``SUBSCRIPTION_TYPE``         <- ``paidForm``
    * ``SUBSCRIPTION_AMOUNT``       <- ``paidAm`` (through
      ``util.get_amount_with_unit`` together with ``amount_unit``)
    * ``SUBSCRIPTION_TIME``         <- ``paidDate``
    * ``SUBSCRIPTION_PUBLISH_TIME`` <- ``paidPubDate``

    A value that resolves to ``None`` is stored as an empty string.

    :param json_obj: iterable of dict-like form records, or ``None``.
    :param amount_unit: unit forwarded to ``util.get_amount_with_unit``.
    :return: list of detail dicts; empty for ``None`` / empty input.
    """
    lst = []
    if json_obj is None or len(json_obj) == 0:
        return lst

    detail = GsModel.ShareholderInformation.SubscriptionDetail
    for obj in json_obj:
        sub_type = obj.get(u'paidForm', u'')
        amount = util.get_amount_with_unit(obj.get(u'paidAm', u''),
                                           amount_unit)
        sub_time = obj.get(u'paidDate', 0)
        # The publish time has its own source field and its own output key;
        # reading 'paidDate' again would only duplicate SUBSCRIPTION_TIME
        # and the publish time would never be emitted.
        publish_time = obj.get(u'paidPubDate', 0)

        lst.append({
            detail.SUBSCRIPTION_TYPE: "" if sub_type is None else sub_type,
            detail.SUBSCRIPTION_AMOUNT: "" if amount is None else amount,
            detail.SUBSCRIPTION_TIME: "" if sub_time is None else sub_time,
            detail.SUBSCRIPTION_PUBLISH_TIME:
                "" if publish_time is None else publish_time,
        })
    return lst
def get_inline_shareholder_info(page):
    """Extract shareholder rows from the ``#gd_JSTab`` table of a page.

    Rows with fewer than two ``<td>`` cells are skipped. For the remaining
    rows, cells 1-3 give the shareholder name and the subscribed / paid-in
    amounts (amounts go through ``util.get_amount_with_unit``); cells 4, 6, 7
    fill a single subscription-detail entry and cells 8, 10, 11 a single
    paid-in-detail entry. Cells 0, 5 and 9 are not read.

    :param page: HTML text of the page.
    :return: ``{GsModel.SHAREHOLDER_INFORMATION: [...]}`` when at least one
        row was parsed, otherwise an empty dict.
    """
    info = GsModel.ShareholderInformation
    collected = []
    table_rows = PyQuery(page, parser='html').find('#gd_JSTab').find('tr')
    for row in table_rows.items():
        cells = row.find('td')
        if cells is None or len(cells) < 2:
            continue

        name = cells.eq(1).text().replace(u'\\t', u'')
        subscribed = util.get_amount_with_unit(cells.eq(2).text())
        paid = util.get_amount_with_unit(cells.eq(3).text())
        subscription_detail = {
            info.SUBSCRIPTION_TYPE: cells.eq(4).text(),
            info.SUBSCRIPTION_TIME: cells.eq(6).text(),
            info.SUBSCRIPTION_PUBLISH_TIME: cells.eq(7).text(),
        }
        paid_detail = {
            info.PAIED_TYPE: cells.eq(8).text(),
            info.PAIED_TIME: cells.eq(10).text(),
            info.PAIED_PUBLISH_TIME: cells.eq(11).text(),
        }
        collected.append({
            info.SHAREHOLDER_NAME: name,
            info.SUBSCRIPTION_AMOUNT: subscribed,
            info.PAIED_AMOUNT: paid,
            info.SUBSCRIPTION_DETAIL: [subscription_detail],
            info.PAIED_DETAIL: [paid_detail],
        })

    if not collected:
        return {}
    return {GsModel.SHAREHOLDER_INFORMATION: collected}
def get_annual_share_hold_info(page_web):
    """Parse annual-report shareholder contributions from a JSON payload.

    ``page_web`` is decoded with ``util.json_loads`` and the records are taken
    from its ``data`` entry. Each record becomes a dict keyed by the
    ``AnnualReports.ShareholderInformation`` constants.

    NOTE: the return shape differs by outcome. When decoding fails or
    ``data`` is ``None`` / ``''`` the tuple ``({}, False)`` is returned; on
    success a plain list of dicts is returned.

    :param page_web: JSON text of the response.
    :return: list of shareholder dicts, or ``({}, False)`` as described.
    """
    payload = util.json_loads(page_web)
    if payload is None:
        return {}, False

    records = payload.get('data', '')
    if records is None or records == '':
        return {}, False

    fields = AnnualReports.ShareholderInformation
    result = []
    for record in records:
        sub_con_date = record.get('subConDate', u'')
        ac_con_date = record.get('acConDate', u'')

        # Dates are stringified only when present; None / '' stay ''.
        if sub_con_date is None or sub_con_date == '':
            sub_con_date_time = ''
        else:
            sub_con_date_time = str(sub_con_date)
        if ac_con_date is None or ac_con_date == '':
            ac_con_date_time = ''
        else:
            ac_con_date_time = str(ac_con_date)

        result.append({
            fields.SHAREHOLDER_NAME: record.get('invName', u''),
            fields.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(record.get('liSubConAm', u'')),
            # subscription time
            fields.SUBSCRIPTION_TIME: sub_con_date_time,
            # subscription type -- original author flagged a pitfall here
            fields.SUBSCRIPTION_TYPE: record.get('subConFormName', u''),
            # paid-in amount
            fields.PAIED_AMOUNT:
                util.get_amount_with_unit(record.get('liAcConAm', u'')),
            # paid-in time
            fields.PAIED_TIME: ac_con_date_time,
            # paid-in type -- original author flagged a pitfall here
            fields.PAIED_TYPE: record.get('acConForm_CN', u''),
        })
    return result
def get_sharehold_info_ac_paid_detail(self, json_obj, amount_unit=u"万元"):
    """Convert a JSON form list into paid-in-detail dicts.

    For every entry of ``json_obj`` one dict is produced, keyed by the
    ``GsModel.ShareholderInformation.PaiedDetail`` constants:

    * ``PAIED_TYPE``         <- ``acForm``
    * ``PAIED_AMOUNT``       <- ``acAm`` (through
      ``util.get_amount_with_unit`` together with ``amount_unit``)
    * ``PAIED_TIME``         <- ``acDate``
    * ``PAIED_PUBLISH_TIME`` <- ``acPubDate``

    A value that resolves to ``None`` is stored as an empty string.

    :param json_obj: iterable of dict-like form records, or ``None``.
    :param amount_unit: unit forwarded to ``util.get_amount_with_unit``.
    :return: list of detail dicts; empty for ``None`` / empty input.
    """
    details = []
    if json_obj is not None and len(json_obj) != 0:
        keys = GsModel.ShareholderInformation.PaiedDetail
        for entry in json_obj:
            paid_form = entry.get(u'acForm', u'')
            paid_amount = util.get_amount_with_unit(
                entry.get(u'acAm', u''), amount_unit)
            paid_date = entry.get(u'acDate', 0)
            publish_date = entry.get(u'acPubDate', 0)

            record = {}
            record[keys.PAIED_TYPE] = (
                paid_form if paid_form is not None else "")
            record[keys.PAIED_AMOUNT] = (
                paid_amount if paid_amount is not None else "")
            record[keys.PAIED_TIME] = (
                paid_date if paid_date is not None else "")
            record[keys.PAIED_PUBLISH_TIME] = (
                publish_date if publish_date is not None else "")
            details.append(record)
    return details