Exemple #1
0
def web_getXTxt100(df,rs0,txtn0=200):
    """Fetch the text behind each URL in ``df`` and save long results to files.

    For every row the ``hdr`` value is turned into a file-name stem by
    replacing reserved characters with ``-``. When the ``url`` value is longer
    than 20 characters its text is fetched with ``web_getXTxt010x9``. If the
    fetched text is longer than ``txtn0`` characters, the header, the URL and
    the text are written through ``zt.f_add`` to ``rs0 + hdr + '.txt'``.

    Args:
        df: DataFrame whose rows provide ``'url'`` and ``'hdr'`` values; both
            are used as strings.
        rs0: Output directory prefix. It is concatenated directly with the
            file name, so it should end with a path separator. The directory
            (including missing parents) is created before the first write.
        txtn0: Exclusive minimum length of the fetched text for a row to be
            saved.

    Returns:
        None. Each written file path is printed to stdout.
    """
    # Raw string avoids the invalid "\/" escape of the former literal; the
    # class still matches exactly: \ / : * ? " < > |
    bad_chars = re.compile(r'[\\/:*?"<>|]')

    for _, row in df.iterrows():
        uss, hdr = row['url'], row['hdr']
        hdr = bad_chars.sub('-', hdr)

        # Short URLs are never fetched; they yield empty text and are skipped.
        txt = web_getXTxt010x9(uss) if len(uss) > 20 else ''
        if len(txt) <= txtn0:
            continue

        # Create the output directory lazily so nothing is created when no row
        # qualifies. makedirs also handles nested paths, which mkdir did not;
        # an empty prefix means the current directory and needs no creation.
        if rs0 and not os.path.exists(rs0):
            os.makedirs(rs0, exist_ok=True)

        fss = rs0 + hdr + '.txt'
        print('    ', fss)
        css = hdr + '\n' + uss + '\n\n' + txt
        # NOTE(review): the meaning of the third positional argument of
        # zt.f_add is not visible here -- confirm against its definition.
        zt.f_add(fss, css, True)
Exemple #2
0
def web_get001txt(url,ucod='gb18030',ftg='',fcod='gbk'):
    """Download ``url`` and return its body as text, optionally saving it.

    The response is obtained from ``web_get001``. Its encoding is set to the
    response's own ``apparent_encoding`` before the text is read. When that
    encoding is UTF-8 the text is additionally cleaned and reduced to the
    characters that can be represented in GBK.

    Args:
        url: Address passed unchanged to ``web_get001``.
        ucod: Not used by this function; kept for interface compatibility.
        ftg: Target file name. When non-empty, the resulting text is written
            with ``zt.f_add`` using ``fcod`` as its ``cod`` argument.
        fcod: Value forwarded as ``cod`` to ``zt.f_add``.

    Returns:
        The (possibly cleaned) text, or ``''`` when ``web_get001`` returns
        ``None``.
    """
    htm = ''
    # NOTE(review): rx is presumably a requests.Response (it exposes
    # apparent_encoding / encoding / text) -- confirm against web_get001.
    rx = web_get001(url)
    if rx is None:
        return htm

    xcod = rx.apparent_encoding
    rx.encoding = xcod
    htm = rx.text

    # apparent_encoding may be None when detection fails; treat that as
    # "not UTF-8" instead of crashing on None.upper().
    if xcod is not None and xcod.upper() == 'UTF-8':
        htm = htm.replace('&nbsp;', ' ')
        # Round trip with 'ignore' drops anything not encodable as UTF-8.
        css = htm.encode("UTF-8", 'ignore').decode("UTF-8", 'ignore')
        # U+FFFD is the Unicode replacement character. The former literal
        # '\xfffd ' was U+00FF followed by the text "fd ", which never matched
        # it and could corrupt legitimate text.
        css = css.replace('\ufffd ', ' ')
        css = css.replace('\xa0 ', ' ')
        # Keep only characters that survive a GBK round trip.
        htm = css.encode("GBK", 'ignore').decode("GBK", 'ignore')

    if ftg != '':
        zt.f_add(ftg, htm, True, cod=fcod)

    return htm