Example #1
def parseUrl(newsUrl):
    print(newsUrl)
    encoding = 'utf-8'
    # Sohu 'yule' (entertainment) pages are served as GB2312; everything else stays UTF-8
    if newsUrl.find('yule') > 0:
        encoding = 'gb2312'
    r = synonym.getHtml(newsUrl, headers, encoding)
    soup = r[1]
    title = soup.find('title')
    # strip characters that are illegal in file and directory names
    title = re.sub(r'[\/:*?"<>|]', "", title.text)
    if not os.path.exists(title):
        os.makedirs(title)
    synonym.downloadText(r[0], title + '/index.html', encoding)
    if newsUrl.find('yule') > 0:
        content = soup.find('div',id='contentText')
    else:
        content = soup.find('article',id='mp-editor')
    #print(content.text)
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', encoding)
    files = []
    for n in range(len(imgs)):
        url = imgs[n]['src']
        if not url.startswith('http'):
            url = 'http:'+ url
        synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
        files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, 'sohu')
    return (title, content.text, files)
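`synonym` and the module-level `headers` are project-local and not shown in these examples. As a rough sketch of what `getHtml` presumably does, assuming a `requests`/BeautifulSoup implementation and the `(html, soup)` return shape inferred from how `r[0]` and `r[1]` are used above:

import requests
from bs4 import BeautifulSoup

def getHtml(url, headers, encoding):
    # Fetch the page, decode it with the caller-supplied encoding, and
    # return (raw HTML, parsed soup) to match how r[0] and r[1] are used above.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = encoding  # e.g. 'gb2312' for Sohu entertainment pages
    html = resp.text
    return (html, BeautifulSoup(html, 'html.parser'))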
Example #2
def run():
    newslist = getNewList('http://ent.163.com/special/00032IAD/ent_json.js')
    for url in newslist:
        try:
            r = synonym.getByUrl(url)
            #print(r)
            if r is not None:  # not None means this URL was already processed; skip it
                continue
            news = parseUrl(url)
            text = bdnlp.nplParse(news[1])
            synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
            files = news[2]
            files.append(news[0] + '/dest.txt')
            SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
        except Exception:
            traceback.print_exc()
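`synonym.getByUrl` evidently returns a stored record when a URL has already been processed, which is why the loop skips on a non-None result, and `synonym.saveUrl` (called in `parseUrl`) records it. A minimal sketch of that bookkeeping, assuming a simple on-disk list (the file name is hypothetical; the real project may use a database):

import os

SEEN_FILE = 'seen_urls.txt'  # hypothetical store

def getByUrl(url):
    # Return the URL if it was already processed, else None.
    if os.path.exists(SEEN_FILE):
        with open(SEEN_FILE, encoding='utf-8') as f:
            if url in (line.strip() for line in f):
                return url
    return None

def saveUrl(url, text, source):
    # Record a processed URL so later runs skip it (text/source unused in this sketch).
    with open(SEEN_FILE, 'a', encoding='utf-8') as f:
        f.write(url + '\n')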
Example #3
def run():
    newslist = getNewList('http://yule.sohu.com/_scroll_newslist/%s/news.inc' %(getToday()))
    for url in newslist:
        try:
            if url.find('picture') > 0:  # photo gallery; a music filter (or url.find('music') > 0) was left disabled
                continue
            r = synonym.getByUrl(url)
            #print(r)
            if r is not None:  # not None means this URL was already processed; skip it
                continue
            news = parseUrl(url)
            text = bdnlp.nplParse(news[1])
            synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
            files = news[2]
            files.append(news[0] + '/dest.txt')
            SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
        except Exception:
            traceback.print_exc()
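`getNewList` and `getToday` are also defined elsewhere in the project. A heavily hedged sketch of a list fetcher that would satisfy this loop; the regex stand-in is an assumption, since the real code almost certainly parses each feed's actual format (JSON-in-JS for 163, an .inc include for Sohu):

import re
import requests

def getNewList(feedUrl):
    # Pull every article URL out of the raw feed text.
    resp = requests.get(feedUrl, timeout=10)
    return re.findall(r'https?://[^\s\'",\\]+\.html', resp.text)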
Example #4
def parseUrl(newsUrl):
    #print(newsUrl)
    r = synonym.getHtml(newsUrl, headers, 'gb2312')  # NetEase (163) pages are fetched as GB2312
    soup = r[1]
    title = soup.find('title')
    title = re.sub(r'[\/:*?"<>|]', "", title.text)
    if not os.path.exists(title):
        os.makedirs(title)
    synonym.downloadText(r[0], title + '/index.html', 'utf-8')
    content = soup.find('div', id='endText')
    #print(content.text)
    imgs = content.select('img')
    synonym.downloadText(content.text, title + '/src.txt', 'utf-8')
    files = []
    for n in range(len(imgs)):
        url = imgs[n]['src']
        if not url.endswith('end_ent.png'):  # skip the article-end marker graphic
            synonym.downloadImg(url, title + '/' + str(n) + '.jpg')
            files.append(title + '/' + str(n) + '.jpg')
    files.append(title + '/src.txt')
    synonym.saveUrl(newsUrl, content.text, '163')
    return (title, content.text, files)
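`synonym.downloadImg` and `synonym.downloadText` are similarly project-local. Minimal sketches consistent with how they are called above (the `requests` dependency is an assumption):

import requests

def downloadImg(url, path):
    # Fetch an image and write the raw bytes to disk.
    resp = requests.get(url, timeout=10)
    with open(path, 'wb') as f:
        f.write(resp.content)

def downloadText(text, path, encoding):
    # Write already-fetched text to disk in the requested encoding.
    with open(path, 'w', encoding=encoding, errors='ignore') as f:
        f.write(text)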
Example #5
def run(cat):
    try:
        newslist = getNewList('http://ent.cri.cn/roll/' + cat)
        for url in newslist:
            try:
                if url.find('picture') > 0:  # photo gallery
                    continue
                r = synonym.getByUrl(url)
                #print(r)
                if r is not None:  # not None means this URL was already processed; skip it
                    continue
                news = parseUrl(url)
                text = bdnlp.nplParse(news[1])
                synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
                files = news[2]
                files.append(news[0] + '/dest.txt')
                SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
            except Exception:
                traceback.print_exc()
            #break
    except Exception:
        pass  # ignore failures fetching the category list itself
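Finally, `SendMail.mail(SendMail, ...)` passes the class itself in place of `self`, which only works because the method never touches instance state; a standalone function is the more idiomatic shape. A sketch of an equivalent mailer using `smtplib`, where the host, credentials, and addresses are placeholders and attachment handling (the `files` list) is omitted since it would need MIMEMultipart:

import smtplib
from email.mime.text import MIMEText

def mail(subject, body, files=None):
    # Send a plain-text notification; attachments are not handled in this sketch.
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = 'bot@example.com'   # placeholder
    msg['To'] = 'me@example.com'      # placeholder
    with smtplib.SMTP('smtp.example.com') as server:  # placeholder host
        server.login('user', 'password')              # placeholder credentials
        server.send_message(msg)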