Example #1
import os
import re

def parseUrl(newsUrl):
    # 'synonym' is the project's fetch/save helper module and 'headers' a
    # module-level request-headers dict; both are defined elsewhere.
    print(newsUrl)
    # Sohu's 'yule' (entertainment) pages are served as gb2312, the rest as utf-8.
    encoding = 'gb2312' if 'yule' in newsUrl else 'utf-8'
    r = synonym.getHtml(newsUrl, headers, encoding)
    soup = r[1]
    # Strip characters that are illegal in file names before using the
    # title as a directory name.
    title = re.sub(r'[\\/:*?"<>|]', '', soup.find('title').text)
    if not os.path.exists(title):
        os.makedirs(title)
    synonym.downloadText(r[0], title + '/index.html', encoding)
    # The article body sits in a different container on each page type.
    if 'yule' in newsUrl:
        content = soup.find('div', id='contentText')
    else:
        content = soup.find('article', id='mp-editor')
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', encoding)
    files = []
    for n, img in enumerate(imgs):
        url = img['src']
        # Protocol-relative URLs (//img...) need an explicit scheme.
        if not url.startswith('http'):
            url = 'http:' + url
        synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
        files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, 'sohu')
    return (title, content.text, files)
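All five examples lean on a small project helper module, synonym, together with a module-level headers dict, neither of which appears in the snippets. A minimal sketch of the interface the call sites imply might look like the following; every name and signature here is reconstructed from usage, not taken from the real project.

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed request headers

def getHtml(url, headers, encoding):
    # The call sites unpack the result as r[0] (decoded text) and r[1] (soup).
    resp = requests.get(url, headers=headers)
    resp.encoding = encoding
    return resp.text, BeautifulSoup(resp.text, 'html.parser')

def downloadText(text, path, encoding):
    # Write already-fetched text to disk with the given encoding.
    with open(path, 'w', encoding=encoding) as f:
        f.write(text)

def downloadImg(url, path):
    # Fetch a binary resource and write it to disk.
    with open(path, 'wb') as f:
        f.write(requests.get(url, headers=headers).content)

def saveUrl(url, text, source):
    # The examples record each processed URL per source site; how the real
    # project persists this is unknown, so this is left as a stub.
    pass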
Example #2
import re

def getNewList(url):
    print(url)
    r = synonym.getHtml(url, headers, 'gb2312')
    result = r[0]
    # The list endpoint returns JSONP-style text; slice away the wrapper
    # around the payload that follows the 'ent:' label.
    newList = result[result.find('ent:') + 5:-3]
    # Pull the article URLs out with a regex (dots escaped so '.' in the
    # domain is matched literally) rather than parsing the payload as JSON.
    pattern = re.compile(r'http://ent\.163\.com/.*?\.html')
    return pattern.findall(newList)
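For illustration, here is how the slice-then-match behaves on a made-up payload; the sample string is an assumption shaped like the JSONP-style response the slicing implies, not a real 163 response.

import re

sample = 'var data = {ent: ["http://ent.163.com/18/0101/abc.html", 1]};\n'
# find('ent:') + 5 skips the label plus one separator character;
# -3 drops the closing wrapper.
payload = sample[sample.find('ent:') + 5:-3]
print(re.findall(r'http://ent\.163\.com/.*?\.html', payload))
# -> ['http://ent.163.com/18/0101/abc.html']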
Example #3
import json

def getNewList(url):
    r = synonym.getHtml(url, headers, 'utf-8')
    result = r[0]
    # 'item:' is five characters, so +5 lands on the opening bracket of the
    # JSON array; -1 drops the closing wrapper character.
    newList = result[result.find('item:') + 5:-1]
    listsohu = json.loads(newList)
    # Each entry is a list whose third field (index 2) is the article URL.
    return [s[2] for s in listsohu]
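Again purely for illustration, with a made-up response shaped like what the slicing implies (the real sohu payload format is an assumption here):

import json

sample = '{item:[["title","2018-01-01","http://www.sohu.com/a/1_2"]]}'
payload = sample[sample.find('item:') + 5:-1]  # -> the JSON array itself
for s in json.loads(payload):
    print(s[2])  # -> http://www.sohu.com/a/1_2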
Example #4
def getNewList(url):
    baseurl = 'http://ent.cri.cn'
    print(url)
    r = synonym.getHtml(url, headers, 'utf-8')
    soup = r[1]
    # The second <ul> on the page holds the article list.
    lis = soup.find_all('ul')[1].find_all('li')
    result = []
    for s in lis:
        a = s.find('a')
        # hrefs are site-relative, so prefix the site root.
        result.append(baseurl + a.get('href'))
    return result
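Indexing into find_all('ul') breaks silently if the page gains or loses a list, so a selector scoped to the list's own container is usually sturdier. A sketch of that alternative follows; the class name is invented for illustration and would have to be read off the actual ent.cri.cn markup.

# 'news-list' is a placeholder class, not taken from the real page.
result = [baseurl + a.get('href') for a in soup.select('ul.news-list li a')]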
Example #5
import os
import re

def parseUrl(newsUrl):
    r = synonym.getHtml(newsUrl, headers, 'gb2312')
    soup = r[1]
    # Strip characters that are illegal in file names before using the
    # title as a directory name.
    title = re.sub(r'[\\/:*?"<>|]', '', soup.find('title').text)
    if not os.path.exists(title):
        os.makedirs(title)
    # The page is fetched as gb2312 but saved to disk re-encoded as utf-8.
    synonym.downloadText(r[0], title + '/index.html', 'utf-8')
    # On 163 article pages the body sits in <div id="endText">.
    content = soup.find('div', id='endText')
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', 'utf-8')
    files = []
    for n, img in enumerate(imgs):
        url = img['src']
        # Skip the decorative end-of-article banner image.
        if not url.endswith('end_ent.png'):
            synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
            files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, '163')
    return (title, content.text, files)
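Putting the pieces together, a driver that chains getNewList into parseUrl could look like the sketch below; the list-page URL is a placeholder, and the error handling is an addition, not part of the original examples.

# Placeholder list URL; each site's real list endpoint differs.
for newsUrl in getNewList('http://ent.163.com/some-list-page'):
    try:
        title, text, files = parseUrl(newsUrl)
        print('saved', title, 'with', len(files), 'files')
    except Exception as e:
        # A layout change or network error on one article should not
        # abort the whole crawl.
        print('failed on', newsUrl, ':', e)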