def name(self, data):
    """Convert blacklist entries into positional records.

    ``data`` is iterated as a sequence of dicts.  For each entry the result
    maps its position to
    ``[types, in_reason, in_date, out_reason, out_date, gov_dept]``.
    Missing keys yield ``''`` for text fields and ``'0000-00-00'`` for dates.
    """
    records = {}
    for position, entry in enumerate(data):
        label = '黑名单'

        in_reason = ''
        if "bulletinListed" in entry:
            in_reason = deal_html_code.remove_symbol(entry["bulletinListed"])

        in_date = '0000-00-00'
        if "abnTime" in entry:
            in_date = change_chinese_date(entry["abnTime"])

        out_reason = ''
        if "bulletinRemoved" in entry:
            out_reason = deal_html_code.remove_symbol(entry["bulletinRemoved"])

        out_date = '0000-00-00'
        if "remTime" in entry:
            out_date = change_chinese_date(entry["remTime"])

        # The department name is stored as-is (no symbol stripping).
        gov_dept = entry["remOrganInterpreted"] if "remOrganInterpreted" in entry else ''

        records[position] = [
            label, in_reason, in_date, out_reason, out_date, gov_dept
        ]
    return records
def deal_single_info(result):
    """Parse one HTML result into a flat dict.

    The returned dict holds:

    * ``"href"``: ``config.host`` plus the first link under the element with
      class ``moreInfo``;
    * ``"code"``: the digit run found in the single red ``<dd>``, or ``None``
      unless there is exactly one such ``<dd>`` containing exactly one digit
      run;
    * one entry per ``<dt>`` under a ``<dl>``, keyed by the cleaned ``<dt>``
      text and valued by the cleaned text of its next sibling.

    Raises ``IndexError`` when the ``moreInfo`` link is missing or a ``<dt>``
    has no following sibling.
    """
    parsed = {}
    # Set the encoding explicitly to prevent garbled text.
    tree = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
    parsed["href"] = config.host + tree.xpath("//*[@class = 'moreInfo']/a/@href")[0]

    dt_nodes = tree.xpath("//dl/dt")
    red_nodes = tree.xpath("//dd[@style='color:red;']")

    code = None
    if len(red_nodes) == 1:
        digit_runs = re.findall(re.compile(r"\d+"), red_nodes[0].xpath('string(.)'))
        if len(digit_runs) == 1:
            code = digit_runs[0]
    parsed['code'] = code

    for dt in dt_nodes:
        label = deal_html_code.remove_symbol(dt.xpath("string(.)"))
        sibling = dt.xpath("./following-sibling::*[1]")[0]
        parsed[label] = deal_html_code.remove_symbol(sibling.xpath("string(.)"))
    return parsed
def get_need_info(result):
    """Collect links, company names and former names from a result page.

    ``result`` must offer ``find``/``find_all`` (BeautifulSoup-style).  Three
    dicts keyed by the position of each result anchor are returned:

    * ``url``: ``url_first`` plus the anchor's ``href``;
    * ``company``: cleaned text of the anchor's ``h1.f20``;
    * ``history_name``: the ``span.g3`` text inside ``div.div-info-circle3``
      with ``*`` removed and empty ``;``-separated parts dropped, or whatever
      ``remove_symbol(None)`` returns when the div is absent.
    """
    url, company, history_name = {}, {}, {}
    # ``attrs`` has to be a dict.  The original passed the set literal
    # {"class", "main-layout fw f14"}, which does not express
    # "class == 'main-layout fw f14'".
    container = result.find('div', {"class": "main-layout fw f14"})
    anchors = container.find_all("a", {"class": "search_list_item db"})
    for i, item in enumerate(anchors):
        url[i] = url_first + item["href"]
        company[i] = remove_symbol(item.find("h1", {"class": "f20"}).text.strip())

        history = None
        circle = item.find("div", {"class": "div-info-circle3"})
        if circle is not None:
            history = circle.find("span", {"class": "g3"}).text.strip()
        # NOTE(review): remove_symbol also receives None here, exactly as in
        # the original; confirm the helper tolerates it.
        history = remove_symbol(history)
        if history is not None:
            parts = str(history.replace(u'*', '')).split(';')
            history = ';'.join(part for part in parts if part != u'')
        history_name[i] = history
    return url, company, history_name
def get_info(self, data):
    """Parse the ``tr[@name='dcdy']`` rows of table ``table_dcdy``.

    Rows without ``<td>`` cells are skipped.  Each remaining row becomes a
    dict with ``code``, ``dates``, ``dept``, ``amount``, ``status`` (columns
    1-5) plus ``person_info`` and ``goods_info`` returned by
    ``self.get_detail_info`` for the detail URL built from the row's
    ``onclick`` attribute.  The result is keyed by the row position.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    info = {}
    rows = data.xpath(".//table[@id='table_dcdy']//tr[@name = 'dcdy']")
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        if not cells:
            continue
        record = {
            "code": text_of(cells[1]),
            "dates": deal_html_code.change_date_style(text_of(cells[2])),
            "dept": text_of(cells[3]),
            "amount": text_of(cells[4]),
            "status": text_of(cells[5]),
        }
        onclick = cells[6].xpath("./a/@onclick")[0]
        # The first extracted argument identifies the detail record.
        xh = deal_html_code.match_key_content(str(onclick))[0]
        detail_url = self._url.format(self._pripid, xh)
        person_info, goods_info = self.get_detail_info(detail_url, record)
        record["person_info"] = person_info
        record["goods_info"] = goods_info
        info[i] = record
    return info
def get_detail(self, string, data, json_data, flag):
    """Fill ``json_data`` with the way/date read from the block after a heading.

    The first element of ``data`` whose text contains ``string`` is located
    and the ``<td>`` cells of its following sibling are read.

    * ``flag == 'rj'`` writes ``ra_ways`` / ``ra_date``;
    * ``flag == 'sj'`` writes ``ta_ways`` / ``ta_date``;
    * any other flag leaves ``json_data`` untouched.

    With fewer than three cells the pair is set to ``''`` / ``'0000-00-00'``
    and an info message is logged.  Raises ``IndexError`` when the heading or
    its sibling cannot be found.  Returns ``None``.
    """
    heading = data.xpath("//*[contains(.,'%s')]" % string)[0]
    # The axis name is "following-sibling"; the original spelled it
    # "following-sibline", which is not a valid XPath axis.
    table = heading.xpath(".//following-sibling::*[1]")
    td = table[0].xpath(".//td")

    targets = {
        'rj': ("ra_ways", "ra_date", "该条数据无认缴信息!"),
        'sj': ("ta_ways", "ta_date", "该条数据无实缴信息!"),
    }
    if flag not in targets:
        return
    ways_key, date_key, missing_message = targets[flag]

    if len(td) < 3:
        logging.info(missing_message)
        json_data[ways_key] = ''
        json_data[date_key] = '0000-00-00'
        return

    ways = deal_html_code.remove_symbol(td[0].xpath("string(.)"))
    date = deal_html_code.remove_symbol(td[2].xpath("string(.)"))
    json_data[ways_key] = ways
    json_data[date_key] = deal_html_code.change_chinese_date(date)
def get_single_info(self, items):
    """Extract one company summary from a search-result element.

    Reads the ``openView('..','..','..','..')`` call in the element's
    ``onclick`` attribute and several ``<span>`` children, and returns a dict
    with ``prirpid``, ``types``, ``company``, ``status``, ``code`` and
    ``reg_date``.  ``self.judge_position`` is called with the legal person
    text and the dict so it can add its own entries.

    Raises ``IndexError`` when an expected node, the ``openView`` call or a
    ``:`` separator is missing.
    """
    record = {}
    on_click = items.xpath("./@onclick")
    pattern = re.compile(r"openView\('(.*?)','(.*?)','(.*?)','(.*?)'\)")
    args = re.findall(pattern, str(on_click))[0]
    # NOTE(review): the key is spelled "prirpid" in the original; it is kept
    # unchanged because callers may read it.
    record["prirpid"] = args[0]
    record["types"] = args[3]

    record["company"] = items.xpath(".//span[@id = 'mySpan']/@title")[0]

    status = items.xpath(".//span[@id = 'mySpan']/following-sibling::*[1]")
    record["status"] = deal_html_code.remove_symbol(status[0].xpath("string(.)"))

    # Relative query: the original used "//span[...]", which searches the
    # whole document and therefore returned the first item's code for every
    # item.
    code = items.xpath(".//span[@class = 'shxydm']")[0]
    record["code"] = code.xpath("string(.)").split(":")[1]

    legal_person = items.xpath(".//span[@class = 'fddbr']")[0].xpath("string(.)")
    legal_person = deal_html_code.remove_symbol(legal_person)
    # Work out the position (title) of the person in charge of the company.
    self.judge_position(legal_person, record)

    reg_date = items.xpath(".//span[@class= 'clrq']")[0].xpath("string(.)")
    reg_date = reg_date.split(":")[1]
    record["reg_date"] = deal_html_code.change_chinese_date(reg_date)
    return record
def deal_single_info(self, href, i, info, cookies):
    """Download one change page and store its parsed record in ``info[i]``.

    The page is requested with the module-level ``headers`` and a 5 second
    timeout.  Nothing is stored when the status is not 200, or when the page
    has more than one ``<p>`` containing the change-time label.  The stored
    record is ``[types, change_date, item, before, after]``.  Returns
    ``None``.
    """
    response = requests.get(href, headers=headers, cookies=cookies, timeout=5)
    if response.status_code != 200:
        return

    tree = etree.HTML(response.content,
                      parser=etree.HTMLParser(encoding='utf-8'))
    types = "变更"
    # Changes come in two layouts: with a table and without one.
    markers = tree.xpath(".//p[contains(.,'%s')]" % u'变更时间')
    marker_count = len(markers)

    if marker_count == 0:
        # Layout without a table: the first four <dd> text nodes are used.
        dd_texts = tree.xpath(".//dl/dd/text()")
        change_date, item, before, after = [
            deal_html_code.remove_symbol(dd_texts[pos]) for pos in range(4)
        ]
        info[i] = [types, change_date, item, before, after]
    elif marker_count == 1:
        change_date, item, before, after = self.deal_table_info(tree)
        info[i] = [types, change_date, item, before, after]
def name(self, url):
    """Fetch ``url`` and parse the rows of table ``tableIdStyle``.

    Returns ``(info, flag)``.  ``flag`` is ``1`` when the request status is
    200 and ``100000004`` otherwise (``info`` is then empty).  ``info`` maps a
    row position to ``[name, code, gov_dept]`` taken from columns 1, 2 and 5.
    """
    info = {}
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    if status_code != 200:
        return info, 100000004

    tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
    rows = tree.xpath("//table[@id = 'tableIdStyle']//tr")
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        # Column 5 is read below, so a row needs at least six cells.  The
        # original only required four and raised IndexError on 4- or 5-cell
        # rows.
        if len(cells) < 6:
            continue
        record_name = deal_html_code.remove_symbol(cells[1].xpath("string(.)"))
        code = deal_html_code.remove_symbol(cells[2].xpath("string(.)"))
        gov_dept = deal_html_code.remove_symbol(cells[5].xpath("string(.)"))
        info[i] = [record_name, code, gov_dept]
    return info, 1
def get_info(self, data):
    """Parse the ``tr[@name='sfxz']`` rows of table ``table_sfxz``.

    Rows without ``<td>`` cells are skipped.  Each remaining row becomes a
    dict with ``exceutor``, ``stock_amount``, ``court``, ``notice_no``,
    ``enforce_no`` (same value as ``notice_no``) and ``status``, stored under
    the row position.  ``self.get_deatail_info`` is called once per row with
    the detail URL built from the row's ``onclick`` attribute.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    columns = (
        ("exceutor", 1),
        ("stock_amount", 2),
        ("court", 3),
        ("notice_no", 4),
        ("status", 5),
    )

    info = {}
    rows = data.xpath(".//table[@id='table_sfxz']//tr[@name = 'sfxz']")
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        if not cells:
            continue
        record = {}
        for key, position in columns:
            record[key] = text_of(cells[position])
        record["enforce_no"] = record["notice_no"]

        onclick = cells[6].xpath("./a/@onclick")[0]
        matched = deal_html_code.match_key_content(str(onclick))
        xh, lx = matched[0], matched[1]
        detail_url = self._url.format(self._pripid, lx, xh)
        # NOTE(review): the accumulated ``info`` dict (not this row's record)
        # is what gets passed on, exactly as in the original; confirm that is
        # intended.
        self.get_deatail_info(detail_url, info)
        info[i] = record
    return info
def name(self, data):
    """Convert branch entries into ``[name, code, gov_dept, ccode]`` lists.

    ``data`` is a sized sequence of dicts; the result is keyed by position.
    ``regNo``, ``uniScid``, ``brName`` and ``regOrganName`` are cleaned with
    ``deal_html_code.remove_symbol``; a missing key yields ``''`` (for
    ``uniScid`` the empty default is still passed through the cleaner).
    """
    info = {}
    if len(data) == 0:
        return info

    clean = deal_html_code.remove_symbol
    for i, single in enumerate(data):
        code = clean(single["regNo"]) if "regNo" in single else ''
        ccode = clean(single["uniScid"] if "uniScid" in single else '')
        br_name = clean(single["brName"]) if "brName" in single else ''
        gov_dept = clean(single["regOrganName"]) if "regOrganName" in single else ''
        info[i] = [br_name, code, gov_dept, ccode]
    return info
def name(self, url):
    """Fetch ``url`` and parse the ``<dl>`` inside ``div.viewBox``.

    Returns ``(info, flag)``.  ``flag`` is ``1`` when the request status is
    200 and ``100000004`` otherwise (``info`` is then empty).  The serialized
    ``<dl>`` is split on ``<br/>``; every chunk except the last is parsed and
    stored by position as ``[name, types, license_type, license_code]`` (the
    ``<dt>`` text followed by the first three ``<dd>`` texts).
    """
    info = {}
    content, status_code = Send_Request().send_request(url)
    if status_code != 200:
        return info, 100000004

    tree = etree.HTML(content, parser=etree.HTMLParser(encoding="utf-8"))
    dl_node = tree.xpath("//div[@class ='viewBox']//dl")[0]
    # Drop the useless trailing chunk by position.  The original used
    # list.remove(chunks[-1]), which removes the first chunk *equal* to the
    # last one and could therefore discard an earlier chunk instead.
    chunks = etree.tostring(dl_node).split("<br/>")[:-1]
    for i, chunk in enumerate(chunks):
        fragment = etree.HTML(chunk, parser=etree.HTMLParser(encoding="utf-8"))
        record_name = deal_html_code.remove_symbol(
            fragment.xpath(".//dt")[0].xpath("string(.)"))
        dd_nodes = fragment.xpath('.//dd')
        types = deal_html_code.remove_symbol(dd_nodes[0].xpath("string(.)"))
        license_type = deal_html_code.remove_symbol(dd_nodes[1].xpath('string(.)'))
        license_code = deal_html_code.remove_symbol(dd_nodes[2].xpath('string(.)'))
        info[i] = [record_name, types, license_type, license_code]
    return info, 1
def get_info(self, data):
    """Turn table rows into dicts of pledge fields, keyed by row position.

    ``data`` is iterated as a sequence of row elements; rows without ``<td>``
    cells are skipped.  Columns 1-6 fill ``equityNo``, ``pledgor``,
    ``pledBLicNo``, ``impAm``, ``impOrg`` and ``impOrgBLicNo``; column 7 is
    ``equPleDate``, column 8 is ``type`` and column 9 is ``publicDate`` (both
    dates go through ``deal_html_code.change_chinese_date``).
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    plain_columns = (
        ("equityNo", 1),
        ("pledgor", 2),
        ("pledBLicNo", 3),
        ("impAm", 4),
        ("impOrg", 5),
        ("impOrgBLicNo", 6),
    )

    info = {}
    for i, row in enumerate(data):
        cells = row.xpath("./td")
        if not cells:
            continue
        record = {}
        for key, position in plain_columns:
            record[key] = text_of(cells[position])
        record["equPleDate"] = deal_html_code.change_chinese_date(text_of(cells[7]))
        public_date = text_of(cells[9])
        record["type"] = text_of(cells[8])
        record["publicDate"] = deal_html_code.change_chinese_date(public_date)
        info[i] = record
    return info
def get_info(self, data):
    """Turn inspection rows into dicts keyed by row position.

    ``data`` is iterated as a sequence of row elements (a comment in the
    original suggests ``//table[@id = 'table_ccjc']//tr[@name = 'ccjc']``).
    Columns 1-4 of each row fill ``gov_dept``, ``types``, ``check_date`` and
    ``result``.  Rows with fewer than five cells are skipped.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    info = {}
    for i, row in enumerate(data):
        # Query relative to the row.  The original used "//td", which selects
        # every cell of the whole document, so each row produced the same
        # values.
        cells = row.xpath("./td")
        # Column 4 is read below; skip header or short rows instead of
        # raising IndexError.
        if len(cells) < 5:
            continue
        info[i] = {
            "gov_dept": text_of(cells[1]),
            "types": text_of(cells[2]),
            "check_date": text_of(cells[3]),
            "result": text_of(cells[4]),
        }
    return info
def get_info(self, data):
    """Parse every ``<tr>`` under ``data`` into ``{"types", "valto"}`` dicts.

    Rows with zero or one ``<td>`` are skipped.  ``types`` is the cleaned text
    of column 1; ``valto`` is column 2 passed through
    ``deal_html_code.change_chinese_date``.  The result is keyed by the row
    position.
    """
    rows = data.xpath(".//tr")
    info = {}
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        if len(cells) <= 1:
            continue
        types = deal_html_code.remove_symbol(cells[1].xpath("string(.)"))
        valto = deal_html_code.remove_symbol(cells[2].xpath("string(.)"))
        info[i] = {
            "types": types,
            "valto": deal_html_code.change_chinese_date(valto),
        }
    return info
def get_info(self, data):
    """Parse every ``<tr>`` under ``data`` into percentage-change dicts.

    Rows with zero or one ``<td>`` are skipped.  Columns 1-3 fill ``name``,
    ``percent_pre`` and ``percent_after``; column 4 is converted with
    ``deal_html_code.change_chinese_date`` and stored as ``dates``.  The
    result is keyed by the row position.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    info = {}
    for i, row in enumerate(data.xpath(".//tr")):
        cells = row.xpath("./td")
        if len(cells) <= 1:
            continue
        record = {}
        for key, position in (("name", 1), ("percent_pre", 2), ("percent_after", 3)):
            record[key] = text_of(cells[position])
        record["dates"] = deal_html_code.change_chinese_date(text_of(cells[4]))
        info[i] = record
    return info
def deal_tr_content(self, result, string):
    """Flatten the table containing ``string`` into one ``||``-joined string.

    The first ``<table>`` under ``result`` whose text contains ``string`` is
    used.  Its first two direct ``<tr>`` children are discarded; every
    remaining row contributes ``"<cell 0> <cell 1>"`` (cleaned text), and the
    contributions are joined with ``"||"``.  Returns ``''`` when no rows
    remain.  Raises ``IndexError`` when the table is missing or has fewer
    than two rows.
    """
    table = result.xpath(".//table[contains(.,'%s')]" % string)[0]
    rows = table.xpath("./tr")
    # Discard the two leading rows.
    del rows[0]
    del rows[0]

    pieces = []
    for row in rows:
        cells = row.xpath("./td")
        left = deal_html_code.remove_symbol(cells[0].xpath("string(.)"))
        right = deal_html_code.remove_symbol(cells[1].xpath("string(.)"))
        pieces.append(left + " " + right)
    return "||".join(pieces)
def get_person_info(self, data):
    """Parse the ``tr[@name='dydj']`` rows under ``data``.

    Each row becomes ``{"name", "cert", "number"}`` from the cleaned text of
    columns 1, 2 and 3, stored under the row position.  Rows with fewer than
    four cells raise ``IndexError``.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    rows = data.xpath(".//tr[@name = 'dydj']")
    info = {}
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        info[i] = {
            "name": text_of(cells[1]),
            "cert": text_of(cells[2]),
            "number": text_of(cells[3]),
        }
    return info
def name(self, data):
    """Convert alteration entries into positional records.

    ``data`` is a sized sequence of dicts.  Each entry becomes
    ``[content_before, content_after, change_date, item]`` stored under its
    position.  Missing keys yield ``''`` (``'0000-00-00'`` for the date).
    Only ``altAf`` is cleaned with ``remove_symbol``; ``altDate`` goes through
    ``change_chinese_date``; ``altBe`` and ``altItem`` are stored as-is.
    """
    info = {}
    if len(data) == 0:
        return info

    for i, single in enumerate(data):
        content_before = single["altBe"] if "altBe" in single else ''

        content_after = ''
        if "altAf" in single:
            content_after = deal_html_code.remove_symbol(single["altAf"])

        change_date = '0000-00-00'
        if "altDate" in single:
            change_date = deal_html_code.change_chinese_date(single["altDate"])

        item = single["altItem"] if "altItem" in single else ""

        info[i] = [content_before, content_after, change_date, item]
    return info
def get_all_info(gs_basic_id, gs_search_id, info_list):
    """Process every section of ``info_list`` for one company.

    Steps, in order:

    1. start the log via ``Log().found_log``;
    2. parse and store the ``"basic"`` section, printing the returned flag;
    3. hand every other section except ``"report"`` / ``"report1"`` to
       ``Judge(...).update_info``;
    4. collect the date in column 2 of each ``"report"`` row and, when any
       were found, pass them with ``info_list["report1"]`` to
       ``SHX_report.main``.
    """
    Log().found_log(gs_basic_id, gs_search_id)

    basic_info = class_dict["basic"]().get_info(info_list["basic"])
    basic_flag = class_dict["basic"]().update_to_db(basic_info, gs_basic_id)
    print("basic:%s" % basic_flag)

    # NOTE(review): hard-coded identifier carried over from the original.
    pripid = '28890'
    name = basic_info["name"]

    # These sections are special and are handled separately.
    handled_elsewhere = ("basic", "report", "report1")
    for key, value in info_list.items():
        if key in handled_elsewhere:
            continue
        # The original also skipped keys "not in info_list.keys()", which can
        # never be true while iterating info_list itself, so the dead check
        # was removed.
        # NOTE(review): it may have been meant to test class_dict or
        # config.dict_url instead.
        Judge(pripid, name, config.dict_url[key]).update_info(
            key, class_dict[key], value, gs_basic_id)

    # Handle the annual report separately.
    fill_data = {}
    for i, row in enumerate(info_list["report"].xpath(".//tr")):
        cells = row.xpath(".//td")
        if len(cells) == 0:
            continue
        fill_data[i] = deal_html_code.change_chinese_date(
            deal_html_code.remove_symbol(cells[2].xpath("string(.)")))

    if len(fill_data) == 0:
        print("report:-1")
    else:
        print("report:%s" % len(fill_data))
        # NOTE(review): the original source had lost its indentation; the
        # report import is assumed to run only when rows were found.
        SHX_report.main(info_list["report1"], fill_data, gs_basic_id)
def name(self, data):
    """Convert abnormal-operation entries into positional records.

    ``data`` is iterated as a sequence of dicts.  Each entry becomes
    ``[types, in_reason, in_date, out_reason, out_date, gov_dept]`` stored
    under its position.  Missing keys yield ``''`` for text fields and
    ``'0000-00-00'`` for dates.  Only ``remExcpResInterpreted`` is cleaned
    with ``remove_symbol``; both dates go through ``change_chinese_date``.
    """
    information = {}
    # enumerate() replaces the Python-2-only xrange(len(data)) index loop.
    for i, entry in enumerate(data):
        types = '经营异常'

        in_reason = entry["speCauseInterpreted"] if "speCauseInterpreted" in entry else ''

        in_date = '0000-00-00'
        if 'abnTime' in entry:
            in_date = deal_html_code.change_chinese_date(entry["abnTime"])

        out_reason = ''
        if "remExcpResInterpreted" in entry:
            out_reason = deal_html_code.remove_symbol(entry["remExcpResInterpreted"])

        out_date = '0000-00-00'
        if 'remDate' in entry:
            out_date = deal_html_code.change_chinese_date(entry["remDate"])

        gov_dept = entry["decOrgInterpreted"] if "decOrgInterpreted" in entry else ''

        information[i] = [
            types, in_reason, in_date, out_reason, out_date, gov_dept
        ]
    return information
def name(self, data):
    """Convert trademark entries into positional records.

    ``data`` is iterated as a sequence of dicts that must contain every key
    read below (``KeyError`` otherwise).  Each entry becomes
    ``[ia_zch, ia_flh, ia_zcgg, ia_servicelist, ia_zyqqx, ia_zcdate, nodeNum,
    tmImage]`` stored under its position.  ``ia_zyqqx`` is
    ``begin + '至' + end``, or ``''`` when both dates equal the
    ``'0000-00-00'`` placeholder.
    """
    information = {}
    # enumerate() replaces the Python-2-only xrange(len(data)) index loop.
    for i, entry in enumerate(data):
        node_num = entry["nodeNum"]
        ia_zch = entry["regNum"]
        ia_flh = entry["intCls"]
        ia_zcgg = entry["regAnncIssue"]
        ia_servicelist = remove_symbol(entry["goodsCnName"])
        begin = change_date_style(entry["propertyBgnDate"])
        end = change_date_style(entry["propertyEndDate"])

        # The original compared ``begin`` with the mistyped '0000-0000-00',
        # so this branch could never match.
        # NOTE(review): assumes change_date_style yields '0000-00-00' for a
        # missing date, as the original ``end`` comparison implies.
        if begin == '0000-00-00' and end == '0000-00-00':
            ia_zyqqx = ''
        else:
            ia_zyqqx = begin + '至' + end

        ia_zcdate = change_date_style(entry["regAnncDate"])
        tm_image = entry["tmImage"]
        information[i] = [
            ia_zch, ia_flh, ia_zcgg, ia_servicelist, ia_zyqqx, ia_zcdate,
            node_num, tm_image
        ]
    return information
def deal_single_info(self, i, single, cookies, item):
    """Assemble one patent record from a result element into ``item[i]``.

    Hidden ``<input>`` values and labelled fields (via ``self.deal_info``) are
    read from ``single``; the remark, legal-status and family information
    come from ``self.get_remark``, ``self.get_law_info`` and
    ``self.get_cognation_info``.  The stored list is
    ``[name, code, app_date, applicant, address, inventor, main_cate,
    sub_cate, pub_code, pub_date, priority, remark, agent, agency, source,
    law_search_info, same_info]``.  Returns ``None``.
    """
    def hidden_value(query):
        return single.xpath(query)[0].xpath('./@value')[0]

    def labelled(label):
        return self.deal_info(label, single)

    # Document identifier.
    nrdAn = hidden_value(".//input[@name = 'nrdAnHidden']")
    # Unique document identifier; also used as ``sid``.
    cid = hidden_value(".//input[@name='idHidden']")
    sid = cid
    nrdPn = hidden_value(".//input[@name ='nrdPnHidden']")

    agency = labelled(u'代理机构')
    agent = labelled(u'代理人')
    code = labelled(u'申请号').split('CN')[-1]
    app_date = deal_html_code.change_date(labelled(u'申请日'))
    applicant = labelled(u'申请(专利权)人')
    address = hidden_value(".//input[@name ='appAddrHidden']")
    inventor = labelled(u'发明人')
    # NOTE(review): main and sub category are both looked up with the same
    # label, as in the original; confirm that is intended.
    main_cate = labelled(u'IPC分类号')
    sub_cate = labelled(u'IPC分类号')
    pub_code = labelled(u'公开(公告)号')
    pub_date = deal_html_code.change_date(labelled(u'公开(公告)日'))
    priority_date = labelled(u'优先权日')
    priority_code = labelled(u'优先权号')
    priority = priority_date + ' ' + priority_code

    name = deal_html_code.remove_symbol(
        hidden_value(".//input[@name ='titleHidden']"))
    remark = self.get_remark(nrdPn, sid, cid, cookies)
    source = 'pss-system'
    law_search_info = self.get_law_info(nrdAn, nrdPn, cookies)

    # The link text ends with ":<count>"; a count of 0 means no family data.
    family_link = single.xpath(".//a[contains(.,'%s')]" % u'同族')[0]
    family_count = family_link.xpath("string(.)").split(":")[-1]
    if int(family_count) == 0:
        same_info = {}
    else:
        same_info = self.get_cognation_info(nrdPn, cookies)

    item[i] = [
        name, code, app_date, applicant, address, inventor, main_cate,
        sub_cate, pub_code, pub_date, priority, remark, agent, agency, source,
        law_search_info, same_info
    ]
def deal_dd_content(string, result):
    """Return the cleaned text of the element following a matching ``<dt>``.

    The first ``<dt>`` under ``result`` whose text contains ``string`` is
    located and the text of its next sibling is passed through
    ``deal_html_code.remove_symbol``.  Raises ``IndexError`` when the ``<dt>``
    or its sibling is missing.
    """
    dt_node = result.xpath(".//dt[contains(.,'%s')]" % string)[0]
    sibling = dt_node.xpath("./following-sibling::*[1]")[0]
    return deal_html_code.remove_symbol(sibling.xpath("string(.)"))
def main():
    """Run one pass over the pending tasks returned by ``select_info``.

    For every ``(gs_new_id, name, province)`` row: rows from the excluded
    provinces or with an empty cleaned name are only reported; otherwise the
    name is searched with ``GetUrl.main`` and the result is stored with
    ``update_info``.  A failed search is logged and the row is updated with
    ``update_status1``.

    Any exception is logged with its traceback and swallowed.  The cursor and
    connection are closed before returning.
    """
    Bulid_Log.Log().found_log()
    excluded_provinces = ('SHH', 'HEB', 'SCH', 'YUN', 'JSU')
    connect = cursor = None
    try:
        connect, cursor = Connect_to_DB().ConnectDB(
            config.HOST, config.USER, config.PASSWD, config.DB, config.PORT)
        count = cursor.execute(select_info)
        if count == 0:
            print("there is no task need to do!")
            return

        for gs_new_id, name, province in cursor.fetchall():
            print("now the gs_new_id is %s" % gs_new_id)
            logging.info("now the gs_new id is %s" % gs_new_id)
            name = deal_html_code.remove_symbol(name)

            if province in excluded_provinces:
                print("the province is out of range")
                logging.info("the province is out of range")
            elif name == '':
                print("this is an useless information!")
                logging.info("this is an useless information!")
            else:
                info, flag = GetUrl.main(name)
                if flag == 1 and len(info) > 0:
                    update_info(cursor, connect, info, gs_new_id, province)
                else:
                    logging.info(
                        "get cookies failed or there is no search information ,the status is %s"
                        % flag)
                    # NOTE(review): the original source had lost its
                    # indentation; the status update is assumed to belong to
                    # this failure branch.
                    cursor.execute(update_status1, (gs_new_id, gs_new_id))
                    connect.commit()
    except Exception as e:
        # Top-level boundary: keep the process alive but record the traceback.
        logging.exception("unknown error:%s" % e)
    finally:
        # The original never released the database handles.
        if cursor is not None:
            cursor.close()
        if connect is not None:
            connect.close()
def get_info(self, data):
    """Parse every ``<tr>`` under ``data`` into ``{"name", "position"}`` dicts.

    Rows without any ``<td>`` descendant are skipped.  ``name`` and
    ``position`` are the cleaned texts of cells 1 and 2.  The result is keyed
    by the row position.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    info = {}
    for i, row in enumerate(data.xpath(".//tr")):
        cells = row.xpath(".//td")
        if not cells:
            continue
        info[i] = {
            "name": text_of(cells[1]),
            "position": text_of(cells[2]),
        }
    return info
def get_info(code, ccode):
    """Search by registration code, falling back to the company name.

    ``code`` is used as the search key unless only ``ccode`` starts with
    ``"9"``, in which case ``ccode`` is used.  When ``get_list`` answers with
    flag ``100000003``, the company name is read from the database, cleaned
    and searched instead.

    Returns the ``(info, flag)`` pair of the last ``get_list`` call.  Raises
    ``IndexError`` when the name query returns no rows.
    """
    code_hit = code.startswith('9')
    ccode_hit = ccode.startswith('9')
    # Same choice as the original chain: ccode wins only when it alone
    # starts with "9".
    string = ccode if (ccode_hit and not code_hit) else code

    info, flag = get_list(string)
    if flag == 100000003:
        connect, cursor = Connect_to_DB().ConnectDB(
            config.HOST, config.USER, config.PASSWD, config.DB, config.PORT)
        try:
            # NOTE(review): gs_basic_id is not a parameter and must exist at
            # module level; the statement is built with "%" exactly as in the
            # original.
            cursor.execute(select_name % gs_basic_id)
            name = cursor.fetchall()[0][0]
        finally:
            # Release the handles even when the query or fetch fails.
            cursor.close()
            connect.close()
        name = deal_html_code.remove_symbol(name)
        info, flag = get_list(name)
    return info, flag
def name(self, data):
    """Convert investor entries into positional records.

    ``data`` is a sized sequence of dicts.  Each entry becomes
    ``[name, types, license_code, license_type, reg_amount, true_amount,
    ta_date, ta_ways, country, address, encrypted, cetfType]`` stored under
    its position.

    ``blicTypeInterpreted``, ``cetfTypeInterpreted`` and ``encrypted`` are
    read unconditionally (``KeyError`` when absent); the other keys default
    to ``''`` (``'0000-00-00'`` for the date).  A ``conForm`` of ``'1'`` is
    replaced with ``'货币'``.
    """
    info = {}
    if len(data) == 0:
        return info

    for i, single in enumerate(data):
        inv_name = single.get("invName", '')
        types = single.get("invTypeInterpreted", '')

        # Prefer the business licence; fall back to the certificate, then to
        # empty values when both type fields are blank.
        if single["blicTypeInterpreted"] != '':
            license_type = single["blicTypeInterpreted"]
            license_code = single["bLicNo"]
        elif single["cetfTypeInterpreted"] != '':
            license_type = single["cetfTypeInterpreted"]
            license_code = single["cetfId"]
        else:
            license_code = ''
            license_type = ''
        license_code = deal_html_code.remove_symbol(license_code)

        reg_amount = single.get("subconAm", '')
        true_amount = single.get("acconAm", '')

        ta_date = '0000-00-00'
        if "conDate" in single:
            ta_date = deal_html_code.change_chinese_date(single["conDate"])

        ta_ways = single.get("conForm", '')
        if ta_ways == '1':
            ta_ways = '货币'

        country = single.get("countryInterpreted", '')
        address = single.get("dom", '')
        encrypted = single["encrypted"]

        if "cetfType" in single:
            cetf_type = single["cetfType"]
        elif "bLicType" in single:
            cetf_type = single["bLicType"]
        else:
            cetf_type = ''

        info[i] = [inv_name, types, license_code, license_type, reg_amount,
                   true_amount, ta_date, ta_ways, country, address, encrypted,
                   cetf_type]
    return info
def deal_table_info(self, result):
    """Parse a change record that is laid out with tables.

    Returns ``(change_date, item, change_before, change_after)``:

    * ``change_date``: the text after the last ``:`` in the ``<p>`` that
      contains the change-time label;
    * ``item``: the text after the last ``:`` in that paragraph's next
      sibling, collapsed to ``"投资人"`` when it mentions the investor or the
      subscribed/paid-in contribution;
    * ``change_before`` / ``change_after``: the output of
      ``self.deal_tr_content`` for the before/after labels.
    """
    marker = result.xpath(".//p[contains(.,'%s')]" % u'变更时间')[0]
    item_node = marker.xpath("./following-sibling::*[1]")[0]

    change_date = deal_html_code.remove_symbol(
        marker.xpath("string(.)")).split(u":")[-1]
    item = deal_html_code.remove_symbol(
        item_node.xpath("string(.)")).split(u":")[-1]

    investor_keywords = (u"投资人", u"认缴的出资额", u"实缴的出资额")
    if any(keyword in item for keyword in investor_keywords):
        item = "投资人"

    change_before = self.deal_tr_content(result, u'变更前')
    change_after = self.deal_tr_content(result, u"变更后")
    return change_date, item, change_before, change_after
def deal_single_info(self, items):
    """Extract one company summary from a search-result element.

    Returns a single-entry dict keyed by the cleaned registration code whose
    value is ``[url, company, status, code, legal_person, dates]``.  ``dates``
    is converted with ``deal_html_code.change_chinese_date`` without prior
    symbol stripping.  Raises ``IndexError`` when an expected node is missing.
    """
    clean = deal_html_code.remove_symbol

    url = items.xpath(".//a[@class='font16']/@href")[0]
    company = clean(items.xpath(".//span[@class= 'rsfont']")[0].xpath("string(.)"))
    status_node = items.xpath(".//span[@class= 'rsfont']/following-sibling::*[1]")[0]
    status = clean(status_node.xpath("string(.)"))

    fields = items.xpath(".//table[@class = 'textStyle']//span[@class = 'dataTextStyle']")
    code = clean(fields[0].xpath("string(.)"))
    legal_person = clean(fields[1].xpath("string(.)"))
    dates = deal_html_code.change_chinese_date(fields[2].xpath("string(.)"))

    return {code: [url, company, status, code, legal_person, dates]}
def get_info(self, data):
    """Parse every ``<tr>`` under ``data`` into contribution dicts.

    Rows with zero or one ``<td>`` are skipped.  Columns 1-7 fill ``name``,
    ``reg_amount``, ``reg_date``, ``reg_way``, ``ac_amount``, ``ac_date`` and
    ``ac_way``; amounts go through ``deal_html_code.match_float`` and dates
    through ``deal_html_code.change_chinese_date``.  The result is keyed by
    the row position.
    """
    def text_of(cell):
        return deal_html_code.remove_symbol(cell.xpath("string(.)"))

    rows = data.xpath(".//tr")
    info = {}
    for i, row in enumerate(rows):
        cells = row.xpath("./td")
        # Header rows and rows that carry no information get special
        # treatment: they are skipped.
        if len(cells) <= 1:
            continue
        info[i] = {
            "name": text_of(cells[1]),
            "reg_amount": deal_html_code.match_float(text_of(cells[2])),
            "reg_date": deal_html_code.change_chinese_date(text_of(cells[3])),
            "reg_way": text_of(cells[4]),
            "ac_amount": deal_html_code.match_float(text_of(cells[5])),
            "ac_date": deal_html_code.change_chinese_date(text_of(cells[6])),
            "ac_way": text_of(cells[7]),
        }
    return info