def web_getXTxt100(df, rs0, txtn0=200):
    """Fetch the text behind every URL in ``df`` and save long results to disk.

    For each row, the ``hdr`` value is turned into a file-name stem by
    replacing the characters ``\\ / : * ? " < > |`` with ``-``.  URLs longer
    than 20 characters are passed to ``web_getXTxt010x9``; shorter ones are
    not fetched.  When the fetched text is longer than ``txtn0`` characters,
    ``hdr``, the URL, a blank line and the text are handed to ``zt.f_add``
    under the path ``rs0 + hdr + '.txt'``.

    Args:
        df: Table iterated with ``iterrows()``; each row must provide the
            ``'url'`` and ``'hdr'`` keys (both are used as strings).
        rs0: Output directory.  It is used as a plain string prefix of the
            file path, so it should end with a path separator.  Created on
            demand (including missing parents) the first time a file is
            written.
        txtn0: Minimum text length (exclusive) required for a file to be
            written.

    Returns:
        None.
    """
    # Compiled once; raw string avoids the invalid "\/" escape of the
    # original non-raw literal while matching exactly the same characters.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')

    for _, row in df.iterrows():
        uss = row['url']
        hdr = unsafe_chars.sub('-', row['hdr'])

        # Only URLs longer than 20 characters are fetched.
        txt = web_getXTxt010x9(uss) if len(uss) > 20 else ''
        if len(txt) <= txtn0:
            continue

        # makedirs also creates missing parents and tolerates an existing
        # directory; an empty prefix means "current directory", nothing to do.
        if rs0:
            os.makedirs(rs0, exist_ok=True)

        fss = rs0 + hdr + '.txt'
        print(' ', fss)
        css = hdr + '\n' + uss + '\n\n' + txt
        zt.f_add(fss, css, True)
def web_get001txt(url, ucod='gb18030', ftg='', fcod='gbk'):
    """Fetch ``url`` and return its text, optionally saving it to ``ftg``.

    The page is requested through ``web_get001``.  The response's
    ``apparent_encoding`` is assigned to its ``encoding`` before ``text`` is
    read.  When that encoding is UTF-8, the text is additionally
    round-tripped through GBK with ``errors='ignore'``, which drops every
    character GBK cannot represent.

    Args:
        url: Address handed to ``web_get001``.
        ucod: Unused; kept so existing callers keep working.
        ftg: Target file name.  When non-empty, the text is passed to
            ``zt.f_add`` with ``cod=fcod``.
        fcod: Encoding name forwarded to ``zt.f_add``.

    Returns:
        The page text, or ``''`` when ``web_get001`` returns ``None`` (in
        which case nothing is written either).
    """
    rx = web_get001(url)
    if rx is None:
        return ''

    # presumably a requests-style response object -- verify against web_get001
    xcod = rx.apparent_encoding
    rx.encoding = xcod
    htm = rx.text

    # apparent_encoding may be None when detection fails; guard before upper().
    if xcod and xcod.upper() == 'UTF-8':
        # NOTE(review): both arguments look identical here, which makes this
        # call a no-op; the first one may originally have been a non-breaking
        # space or an HTML entity -- confirm against the upstream file.
        htm = htm.replace(' ', ' ')
        css = htm.encode("UTF-8", 'ignore').decode("UTF-8", 'ignore')
        # U+FFFD is the Unicode replacement character.  The previous literal
        # '\xfffd ' was parsed as '\xff' + 'fd ' and never matched it.
        css = css.replace('\ufffd ', ' ')
        css = css.replace('\xa0 ', ' ')
        htm = css.encode("GBK", 'ignore').decode("GBK", 'ignore')

    if ftg != '':
        zt.f_add(ftg, htm, True, cod=fcod)
    return htm