Beispiel #1
0
def web_getXLnks(url,ckn=10,kget=None,kflt=None,uget=None,uflt=None,ucod='gbk'):
    """Fetch a page and return its <a> links as a DataFrame with columns 'hdr' and 'url'.

    The page is fetched with web_get001(url) and parsed with BeautifulSoup
    ('html5lib'). Every anchor is passed through up to four optional filters,
    all of which delegate the actual matching to zstr.str_xor:

    Args:
        url: address handed to web_get001.
        ckn: a link is kept only if its cleaned text is strictly longer than
            this many characters.
        kget: if given, keep only links where zstr.str_xor(text, kget) is truthy.
        kflt: if given, drop links where zstr.str_xor(text, kflt) is truthy.
        uget: if given, keep only links where zstr.str_xor(href, uget) is truthy.
        uflt: if given, drop links where zstr.str_xor(href, uflt) is truthy.
        ucod: unused here (the line that applied it as the response encoding
            is disabled); kept so existing callers keep working.

    Returns:
        DataFrame with one row per surviving link: 'hdr' is the link text after
        zstr.str_fltHtmHdr, 'url' is the raw href. Empty (but with both
        columns) when web_get001 returns None or nothing passes the filters.
    """
    cols=['hdr','url']
    rx=web_get001(url)
    if rx is None:
        return pd.DataFrame(columns=cols)
    #
    #rx.encoding =ucod #gb-18030
    bs=BeautifulSoup(rx.text,'html5lib') # 'lxml'
    #
    # Rows are collected in a plain list and the frame is built once at the
    # end: DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every call.
    rows=[]
    for lnk in bs.find_all('a'):
        css,uss=lnk.text,lnk.get('href')
        #
        # A rejected link is marked by setting its href to None; later
        # filters are skipped once that has happened.
        if uflt is not None and uss is not None and zstr.str_xor(uss,uflt):
            uss=None
        if uget is not None and uss is not None and (not zstr.str_xor(uss,uget)):
            uss=None
        #
        if kflt is not None and uss is not None and zstr.str_xor(css,kflt):
            uss=None
        if kget is not None and uss is not None and (not zstr.str_xor(css,kget)):
            uss=None
        #
        # Blank the text of rejected (or href-less) links so the length test
        # below discards them for any non-negative ckn.
        if uss is None:
            css=''
        css=zstr.str_fltHtmHdr(css)
        if len(css)>ckn:
            # NOTE(review): as written this replaces ',' with ',' (a no-op);
            # it may originally have normalised a full-width comma -- confirm.
            css=css.replace(',',',')
            rows.append((css,uss))
    #
    # dtype=object matches the columns the original empty frame started with.
    return pd.DataFrame(rows,columns=cols,dtype=object)
Beispiel #2
0
def fb_gid_get4htm(htm):
    """Parse a match-list HTML page into a DataFrame of games.

    The work is done in five passes over the parsed document:

    1. Every tag selected by zweb.bs_get_ktag becomes one row; its attributes
       'fid', 'lg', 'homesxname', 'awaysxname', 'isend', 'gdate', 'pdate' and
       'pendtime' fill 'gid', 'gset', 'mplay', 'gplay', 'kend', 'tweek',
       'tplay' and 'tsell'. Fields not set here keep the tfsys.gidNil default.
    2. Each <a class="score"> is matched to a row by the id between 'ju-' and
       '.sh' in its href; its "a:b" text fills 'qj', 'qs' and 'kwin'
       (via fb_kwin4qnum).
    3. / 4. <td class="left_team"> and <td class="right_team"> cells supply
       'mtid' and 'gtid' from the '/team/<id>/' part of their link. These
       passes are positional, so each runs only when the number of cells
       equals the number of rows found in pass 1.
    5. Rows whose 'gid' is '-1' are dropped.

    Assumes tfsys.gidSgn already contains every column written above
    -- TODO confirm against tfsys.

    Args:
        htm: HTML text of the page.

    Returns:
        DataFrame with columns tfsys.gidSgn, one row per game.

    Raises:
        IndexError / ValueError: if a score link's text is not two integers
            separated by ':' (only for links that match exactly one row).
    """
    bs = BeautifulSoup(htm, 'html5lib')  # 'lxml'
    # Template row pre-filled with the "nil" defaults; overwritten per game.
    ds = pd.Series(tfsys.gidNil, index=tfsys.gidSgn, dtype=str)

    #---1#
    # NOTE(review): presumably zweb.bs_get_ktag reads zsys.bs_get_ktag_kstr to
    # select tags carrying an 'isend' attribute -- confirm in zweb.
    zsys.bs_get_ktag_kstr = 'isend'
    x10 = bs.find_all(zweb.bs_get_ktag)
    # Collect rows and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    rows = []
    for x in x10:
        ds['gid'], ds['gset'] = x['fid'], zstr.str_fltHtmHdr(x['lg'])
        ds['mplay'] = zstr.str_fltHtmHdr(x['homesxname'])
        ds['gplay'] = zstr.str_fltHtmHdr(x['awaysxname'])
        ds['kend'] = x['isend']
        # Only the part of 'gdate' before the first space is converted.
        ds['tweek'] = fb_tweekXed(x['gdate'].split(' ')[0])  #tweek
        ds['tplay'], ds['tsell'] = x['pdate'], x['pendtime']  #tplay,tsell,
        rows.append(ds.tolist())
    # ds.index equals tfsys.gidSgn unless a field above was missing from it,
    # in which case it also carries the added label (as append used to).
    df = pd.DataFrame(rows, columns=ds.index, dtype=object)

    #---2#
    for x in bs.find_all('a', class_='score'):
        kss = zstr.str_xmid(x['href'], 'ju-', '.sh')
        clst = x.text.split(':')
        #
        hit = df.index[df['gid'] == kss]
        # Ambiguous or unknown game ids are ignored.
        if len(hit) == 1:
            # .loc writes into df itself; the former df[col][idx] = ... form
            # is chained assignment and may silently write to a copy.
            df.loc[hit, 'qj'] = clst[0]
            df.loc[hit, 'qs'] = clst[1]
            kwin = fb_kwin4qnum(int(clst[0]), int(clst[1]))
            df.loc[hit, 'kwin'] = str(kwin)

    #---3# / #---4#
    # Home ('mtid') and away ('gtid') team ids use identical logic and differ
    # only in the cell class and the target column.
    for kcls, kcol in (('left_team', 'mtid'), ('right_team', 'gtid')):
        x20 = bs.find_all('td', class_=kcls)
        if len(x20) != len(x10):
            continue
        for xc, x in enumerate(x20):
            xss = x.a['href']
            # Links containing '/team//' carry no team id and are skipped.
            if xss.find('/team//') < 0:
                xid = zstr.str_xmid(xss, '/team/', '/')
                # Row label xc is the position: df has a 0..n-1 index.
                df.loc[xc, kcol] = xid
                if xid == '':
                    g01 = df.loc[xc, 'gid']
                    zt.f_addLog('tid-' + kcol + ',nil,' + xss + ',gid,' + g01)

    #---5#
    df = df[df['gid'] != '-1']
    return df