def parseUrl(newsUrl):
    """Fetch a Sohu news article and save its HTML, text and images locally.

    Creates a directory named after the (sanitized) page title and writes
    index.html, src.txt and numbered .jpg files into it.

    Returns a tuple ``(title, article_text, saved_file_paths)``.
    """
    print(newsUrl)
    # Sohu entertainment ('yule') pages are served as gb2312; others as utf-8.
    encoding = 'gb2312' if newsUrl.find('yule') > 0 else 'utf-8'
    # NOTE: the original if/else branches issued the identical getHtml call;
    # collapsed into a single call after selecting the encoding.
    r = synonym.getHtml(newsUrl, headers, encoding)
    soup = r[1]
    title = soup.find('title')
    # Strip characters that are illegal in Windows file names.
    title = re.sub(r'[\/:*?"<>|]', "", title.text)
    if not os.path.exists(title):
        os.makedirs(title)
    synonym.downloadText(r[0], title + '/index.html', encoding)
    # Entertainment pages use a different article-body container.
    if newsUrl.find('yule') > 0:
        content = soup.find('div', id='contentText')
    else:
        content = soup.find('article', id='mp-editor')
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', encoding)
    files = []
    for n, img in enumerate(imgs):
        url = img['src']
        # Protocol-relative URLs (//...) need an explicit scheme.
        if not url.startswith('http'):
            url = 'http:' + url
        synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
        files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, 'sohu')
    return (title, content.text, files)
def getNewList(url):
    """Collect NetEase ('163') entertainment article URLs from a listing page."""
    print(url)
    page_text = synonym.getHtml(url, headers, 'gb2312')[0]
    # The payload is a JS snippet; slice out the fragment after the 'ent:' key
    # and drop the trailing wrapper characters.
    start = page_text.find('ent:') + 5
    raw_list = page_text[start:-3]
    article_re = re.compile(r'http://ent.163.com/.*?\.html')
    return article_re.findall(raw_list)
def getNewList(url):
    """Collect article URLs from a Sohu listing page.

    The page embeds a JS array after the 'item:' key; each entry's third
    element (index 2) is the article URL.
    """
    r = synonym.getHtml(url, headers, 'utf-8')
    result = r[0]
    newList = result[result.find('item:') + 5:-1]
    listsohu = json.loads(newList)
    # Idiom: list comprehension replaces the manual append loop.
    return [s[2] for s in listsohu]
def getNewList(url):
    """Collect article URLs from a CRI entertainment listing page.

    Links on the page are relative; each is prefixed with the site base URL.
    """
    baseurl = 'http://ent.cri.cn'
    print(url)
    r = synonym.getHtml(url, headers, 'utf-8')
    soup = r[1]
    # The second <ul> on the page holds the article list.
    lis = soup.find_all('ul')[1].find_all('li')
    # Idiom: list comprehension replaces the manual append loop.
    return [baseurl + li.find('a').get('href') for li in lis]
def parseUrl(newsUrl):
    """Fetch a NetEase ('163') news article and save HTML, text and images.

    Creates a directory named after the (sanitized) page title and writes
    index.html, src.txt and numbered .jpg files into it.

    Returns a tuple ``(title, article_text, saved_file_paths)``.
    """
    r = synonym.getHtml(newsUrl, headers, 'gb2312')
    soup = r[1]
    title = soup.find('title')
    # Strip characters that are illegal in Windows file names.
    title = re.sub(r'[\/:*?"<>|]', "", title.text)
    if not os.path.exists(title):
        os.makedirs(title)
    synonym.downloadText(r[0], title + '/index.html', 'utf-8')
    content = soup.find('div', id='endText')
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', 'utf-8')
    files = []
    for n, img in enumerate(imgs):
        url = img['src']
        # Skip the site's trailing decoration image; note the index still
        # advances, so saved file numbers may have gaps (original behavior).
        if not url.endswith('end_ent.png'):
            synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
            files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, '163')
    return (title, content.text, files)