def web_getXLnks(url, ckn=10, kget=None, kflt=None, uget=None, uflt=None, ucod='gbk'):
    """Collect the ``<a>`` links of a web page into a DataFrame.

    The page is fetched with ``web_get001(url)`` and parsed with
    BeautifulSoup (``html5lib`` parser).  Each anchor contributes its text
    (column ``hdr``) and its ``href`` attribute (column ``url``).

    Filtering, applied per anchor in this order:

    * ``uflt`` -- drop the link when ``zstr.str_xor(href, uflt)`` is true.
    * ``uget`` -- drop the link when ``zstr.str_xor(href, uget)`` is false.
    * ``kflt`` -- drop the link when ``zstr.str_xor(text, kflt)`` is true.
    * ``kget`` -- drop the link when ``zstr.str_xor(text, kget)`` is false.

    NOTE(review): ``zstr.str_xor`` is defined elsewhere; it is used here as a
    "matches" predicate (presumably "contains any of the given keywords") --
    confirm against its definition.

    A dropped link (or an anchor without ``href``) has its title blanked.
    The title is then cleaned with ``zstr.str_fltHtmHdr`` and the row is kept
    only when the cleaned title is longer than ``ckn`` characters.

    Args:
        url: Address of the page to fetch.
        ckn: Minimum title length (exclusive) for a link to be kept.
        kget: Title keywords to require, or ``None`` to disable.
        kflt: Title keywords to reject, or ``None`` to disable.
        uget: URL keywords to require, or ``None`` to disable.
        uflt: URL keywords to reject, or ``None`` to disable.
        ucod: Unused; kept for backward compatibility (the response-encoding
            override that used it is disabled).

    Returns:
        A DataFrame with columns ``['hdr', 'url']`` and a fresh integer
        index; empty when ``web_get001`` returns ``None`` or nothing passes
        the filters.
    """
    columns = ['hdr', 'url']

    rx = web_get001(url)
    if rx is None:
        return pd.DataFrame(columns=columns)

    bs = BeautifulSoup(rx.text, 'html5lib')  # 'lxml' is the alternative parser

    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for lnk in bs.find_all('a'):
        css, uss = lnk.text, lnk.get('href')

        # URL-based filters; a rejected link is marked by uss = None.
        if uflt is not None and uss is not None and zstr.str_xor(uss, uflt):
            uss = None
        if uget is not None and uss is not None and (not zstr.str_xor(uss, uget)):
            uss = None

        # Title-based filters.
        if kflt is not None and uss is not None and zstr.str_xor(css, kflt):
            uss = None
        if kget is not None and uss is not None and (not zstr.str_xor(css, kget)):
            uss = None

        # Blanking the title makes rejected links fail the length test below.
        if uss is None:
            css = ''
        css = zstr.str_fltHtmHdr(css)
        if len(css) > ckn:
            # NOTE(review): as written this replaces ',' with ',' (a no-op);
            # presumably the first argument was a full-width comma -- confirm.
            css = css.replace(',', ',')
            rows.append({'hdr': css, 'url': uss})

    return pd.DataFrame(rows, columns=columns)
def _fb_gid_fill_team_ids(bs, df, n_games, css_class, column):
    """Write team ids found in ``<td class=css_class>`` cells into ``df[column]``.

    Cells are matched to the rows of *df* by position, so the pass is skipped
    entirely unless the page holds exactly *n_games* such cells.  *df* is
    modified in place.
    """
    cells = bs.find_all('td', class_=css_class)
    if len(cells) != n_games:
        return
    for pos, cell in enumerate(cells):
        href = cell.a['href']
        # An href containing '/team//' carries no id segment: keep the default.
        if href.find('/team//') >= 0:
            continue
        tid = zstr.str_xmid(href, '/team/', '/')
        # .loc instead of df[column][pos] = ...: chained assignment may write
        # to a temporary copy and never reach df.
        df.loc[pos, column] = tid
        if tid == '':
            zt.f_addLog('tid-' + column + ',nil,' + href + ',gid,' + df.at[pos, 'gid'])


def fb_gid_get4htm(htm):
    """Parse a match-list HTML page into a DataFrame of games.

    Steps:

    1. Every tag selected by ``zweb.bs_get_ktag`` yields one row.  The
       attributes ``fid``, ``lg``, ``homesxname``, ``awaysxname``, ``isend``,
       ``gdate``, ``pdate`` and ``pendtime`` fill the columns ``gid``,
       ``gset``, ``mplay``, ``gplay``, ``kend``, ``tweek``, ``tplay`` and
       ``tsell``; all other columns keep the defaults from ``tfsys.gidNil``.
    2. For every ``<a class="score">`` the game id is cut out of the href
       (between ``'ju-'`` and ``'.sh'``); when exactly one row has that id,
       the link text split on ``':'`` is stored in ``qj`` / ``qs`` and
       ``fb_kwin4qnum`` of the two integers is stored (as a string) in
       ``kwin``.
    3. / 4. Team ids from ``td.left_team`` / ``td.right_team`` hrefs are
       stored in ``mtid`` / ``gtid`` (see ``_fb_gid_fill_team_ids``).
    5. Rows whose ``gid`` equals ``'-1'`` are dropped.

    Side effect: sets ``zsys.bs_get_ktag_kstr`` to ``'isend'``.

    Args:
        htm: HTML text of the page.

    Returns:
        A DataFrame with the columns ``tfsys.gidSgn``.

    Raises:
        KeyError: if a selected tag lacks one of the attributes read in
            step 1.
        IndexError, ValueError: if the text of a score link that matches a
            single row is not of the form ``'<int>:<int>'``.
    """
    bs = BeautifulSoup(htm, 'html5lib')  # 'lxml' is the alternative parser
    # Template row: every column preset to its "nil" value.
    ds = pd.Series(tfsys.gidNil, index=tfsys.gidSgn, dtype=str)

    # ---1# one row per game tag
    # NOTE(review): zweb.bs_get_ktag presumably reads this module-level value
    # to pick tags carrying an 'isend' attribute -- confirm in zweb.
    zsys.bs_get_ktag_kstr = 'isend'
    x10 = bs.find_all(zweb.bs_get_ktag)
    # Rows are gathered in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for x in x10:
        row = ds.to_dict()
        row['gid'], row['gset'] = x['fid'], zstr.str_fltHtmHdr(x['lg'])
        row['mplay'] = zstr.str_fltHtmHdr(x['homesxname'])
        row['gplay'] = zstr.str_fltHtmHdr(x['awaysxname'])
        row['kend'] = x['isend']
        # Only the part of 'gdate' before the first space is passed on.
        row['tweek'] = fb_tweekXed(x['gdate'].split(' ')[0])
        row['tplay'], row['tsell'] = x['pdate'], x['pendtime']
        rows.append(row)
    # dtype=object keeps the values exactly as stored in the row dicts.
    df = pd.DataFrame(rows, columns=tfsys.gidSgn, dtype=object)

    # ---2# scores
    for link in bs.find_all('a', class_='score'):
        kss = zstr.str_xmid(link['href'], 'ju-', '.sh')
        clst = link.text.split(':')
        inx = df.index[df['gid'] == kss]
        if len(inx) == 1:
            # .loc writes into df itself; the chained form df['qj'][inx] = ...
            # may hit a temporary copy (and always does under Copy-on-Write).
            df.loc[inx, 'qj'] = clst[0]
            df.loc[inx, 'qs'] = clst[1]
            kwin = fb_kwin4qnum(int(clst[0]), int(clst[1]))
            df.loc[inx, 'kwin'] = str(kwin)

    # ---3# / ---4# team ids, matched to rows by position
    _fb_gid_fill_team_ids(bs, df, len(x10), 'left_team', 'mtid')
    _fb_gid_fill_team_ids(bs, df, len(x10), 'right_team', 'gtid')

    # ---5# drop rows that still carry the placeholder id
    df = df[df['gid'] != '-1']
    return df