Example #1
def news():
    # Period (大姨妈)
    insert(
        script.capture(
            'https://news.meiyou.com/news-api/v2/web_news_more?category_id=16'
        ), 1)
    # Pregnancy prep (备孕)
    insert(
        script.capture(
            'https://news.meiyou.com/news-api/v2/web_news_more?category_id=18'
        ), 2)
    # Parenting (育儿)
    insert(
        script.capture(
            'https://news.meiyou.com/news-api/v2/web_news_more?category_id=19'
        ), 3)
    # Beauty (美妆)
    insert(
        script.capture(
            'https://news.meiyou.com/news-api/v2/web_news_more?category_id=8'
        ), 4)
    # Health (健康)
    insert(
        script.capture(
            'https://news.meiyou.com/news-api/v2/web_news_more?category_id=15'
        ), 5)
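
Every example on this page leans on an external `script` helper module that is not shown. From the call sites, `script.capture(url, headers=None)` fetches a URL and returns the response body as a string, or the sentinel string "FAIL" on error. A minimal sketch of that assumed contract; the implementation details here (urllib2, the timeout) are guesses, not the real helper:

import urllib2

def capture(url, headers=None):
    # Fetch `url` and return the raw response body; callers check for the
    # "FAIL" sentinel rather than catching exceptions themselves.
    try:
        request = urllib2.Request(url, headers=headers or {})
        return urllib2.urlopen(request, timeout=10).read()
    except Exception:
        return "FAIL"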
Example #2
def detail(id):
    if id is None:
        print 'url cannot be nil'
        return

    content = script.capture('http://www.dayima.com/articles/article/' + id)

    if content == "FAIL" or content is None:
        return 'content capture failed'

    soup = BeautifulSoup(content, 'html.parser')
    info = soup.find('div', class_='leftArea')
    if info is None:
        return script.error(id)

    # Note: the 'artilce' spelling matches the class names in the site's HTML.
    titleTag = info.find('div', class_='artilce_title')
    newsContent = info.find('div', class_='article_content')
    newsContent = unicode(newsContent).replace("<br/>", "  ")
    bref = unicode(info.find('div', class_='article_brief')).replace("<br/>", " ")

    title = titleTag.string
    time = info.find('span', class_='artilce_time')
    sources = u'大姨妈'

    return script.insert_detail(id, title, bref + newsContent, sources, time.string)
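
These scrapers pass content to BeautifulSoup with an explicit 'html.parser'; without one, bs4 guesses a parser and emits a warning. A self-contained illustration with dummy markup:

from bs4 import BeautifulSoup

html = '<div class="leftArea"><div class="artilce_title">t</div></div>'
soup = BeautifulSoup(html, 'html.parser')  # explicit parser, no guessing
info = soup.find('div', class_='leftArea')
print info.find('div', class_='artilce_title').string  # -> t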
Example #3
def detail(id):
    if id is None:
        print 'url cannot be nil'
        return

    content = script.capture('https://news.meiyou.com/news_detail?news_id=' +
                             str(id))
    # Check for failure before touching the content; calling .replace() on a
    # None result would crash.
    if content == "FAIL" or content is None:
        return 'content capture failed'
    content = content.replace('</html>', '')

    soup = BeautifulSoup(content, 'html.parser')
    # 'warp' (not 'wrap') matches the class names in the site's HTML.
    info = soup.find('div', class_='warp')

    if info is None:
        return script.error(id)

    warp_title = info.find('div', class_='warp-title')
    news_content = info.find('div', class_='news-content')
    news_content = unicode(news_content).replace("<br/>", "")

    title = warp_title.h2
    time = warp_title.find('span', class_='n-time')
    sources = warp_title.findAll('span')

    return script.insert_detail(id, title.string, news_content,
                                sources[-1].string, time.string)
Example #4
def detail(id):
    try:
        content = script.capture('https://www.yidianzixun.com/article/' + id)
        soup = BeautifulSoup(content, 'html.parser')

        wrapperTag = soup.find('div', class_='left-wrapper')
        if wrapperTag is None:
            return script.error(id)

        titleTag = wrapperTag.h2
        if titleTag is None:
            return script.error(id)

        metaTag = wrapperTag.find('div', class_='meta')
        sourceTag = metaTag.a
        timeTag = metaTag.span

        # Regular articles live in 'content-bd'; video pages fall back to
        # 'video-wrapper'.
        contentTag = wrapperTag.find('div', class_='content-bd')
        if contentTag is None:
            contentTag = wrapperTag.find('div', class_='video-wrapper')

        return script.insert_detail(id, titleTag.string, contentTag,
                                    sourceTag.string, timeTag.string)
    except Exception:
        # Any scrape or parse error is swallowed; callers treat None as failure.
        return None
Example #5
def insert(url, type, cookie=''):
    headers = {
        'Referer': 'https://www.yidianzixun.com/channel/e136117',
        'cookie':
        'JSESSIONID=5b833bbd91a7574bbfed3af92d4b4817966f7198e7c2845b95639ad32dc64e3c;',
        'content-type': 'application/json; charset=utf-8'
    }

    try:
        content = script.capture(url, headers)
        content = json.loads(content)
    except Exception:
        print url + ": parse failed"
        return

    if 'result' not in content:
        return

    ids = []

    for news in content['result']:
        # Only plain news items are inserted; everything else is skipped.
        # (Equality, not identity: `is not 'news'` would skip valid items.)
        if news.get('content_type') != 'news':
            continue

        # itemid and title are required; validate before using them.
        news_id = news.get('itemid')
        title = news.get('title')
        if title is None or news_id is None:
            continue
        title = title.replace("\n", "").strip()
        sourceName = u'一点资讯'
        author = news['source']
        summary = news['summary']

        ico = ''
        if 'wemedia_info' in news and 'image' in news['wemedia_info']:
            ico = news['wemedia_info']['image']

        imgs = []
        for img in news.get('image_urls', []):
            imgs.append(
                'https://i1.go2yd.com/image.php?type=thumbnail_336x216&url='
                + img)

        script.insert_news(news_id, title, sourceName, SOURCE_HOST, author, 0,
                           ico, type, imgs, summary)

        if detail(news_id) == 1:
            ids.append(news_id)

    print ('-------------- insert yidianzixun type:' + str(type) +
           ' count:' + str(len(content['result'])) + ' --------------')

    script.appendIDs(ids)
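
For reference, the response shape this function assumes, reconstructed from its field accesses; the values below are illustrative placeholders, not real API output:

sample = {
    'result': [{
        'content_type': 'news',
        'itemid': 'V_abc123',          # used as news_id
        'title': 'headline',
        'source': 'publisher',         # passed through as the author
        'summary': 'one-line teaser',
        'wemedia_info': {'image': 'https://.../icon.png'},  # optional icon
        'image_urls': ['//img.example/1.jpg'],              # optional thumbnails
    }]
}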
Example #6
def detail(id):
    if id is None:
        print 'url cannot be nil'
        return

    content = script.capture('http://www.sohu.com/a/' + str(id))
    if content == "FAIL" or content is None:
        print 'content capture failed'
        return

    soup = BeautifulSoup(content, 'html.parser')
    info = soup.find('div', class_='text')

    if info is None:
        return script.error(id)

    headerTag = info.find('div', class_='text-title')

    # Take the first non-empty string inside the <h1>; default to '' so
    # `title` is always defined even when the heading is empty.
    title = next(headerTag.find('h1').stripped_strings, '')

    author = u'搜狐新闻'

    try:
        sourceTag = headerTag.find('div', class_='article-info')
        timeTag = sourceTag.find('span', class_='time')
        authorsTag = sourceTag.find('span', class_='tag').findAll('a')
        if len(authorsTag) > 0:
            author = authorsTag[-1].string
    except Exception:
        pass

    try:
        news_content = info.find('div', class_='article')
        if news_content is None:
            news_content = info.find('article', class_='article')

        # Drop the "back" widget if present before serializing the body.
        backword = news_content.find('span', class_='backword')
        if backword is not None:
            backword.extract()
        news_content = unicode(news_content).replace("<br/>", "")

        return script.insert_detail(id, title, news_content, author,
                                    timeTag.string)
    except Exception:
        return None
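
Grabbing the first entry of a tag's stripped_strings generator is a one-liner with next() and a default, which is what the title extraction above relies on:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<h1> Hello <b>world</b> </h1>', 'html.parser')
print next(soup.h1.stripped_strings, '')  # -> Hello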
Example #7
def news():
    url = 'http://www.dayima.com/articles'
    content = script.capture(url)

    if content == "FAIL" or content is None:
        return 'invalid path'

    soup = BeautifulSoup(content, 'html.parser')
    articles = soup.findAll('div', class_='dotted')

    ids = []

    for article in articles:
        title = article.find('div', class_='title')
        aTag = title.a

        url = aTag.get('href')
        name = aTag.string

        # The article id is the last path component of the link.
        query = url.split('/')
        news_id = query[-1]

        pic = article.find('div', class_='picArea')
        imgTag = pic.img

        read_count = 0
        source_name = u'大姨妈'
        source_ico = ''

        script.insert_news(news_id, name, source_name, SOURCE_HOST, '',
                           read_count, source_ico, 1, (imgTag['src'],))

        if detail(news_id) == 1:
            ids.append(news_id)

    print ('----------------------- insert dayima count: ' +
           str(len(articles)) + '  -----------------------------')

    script.appendIDs(ids)
Example #8
def insert(url, type):
    content = script.capture(url)
    items = json.loads(content)  # parsed list of article entries
    ids = []
    for item in items:
        title = item['title'].replace("\n", "")
        title = title.strip()
        authorURL = ''
        if item['authorPic'] is not None and len(item['authorPic']) > 10:
            authorURL = 'https:' + item['authorPic']

        news_id = str(item['id']) + '_' + str(item['authorId'])

        script.insert_news(news_id, title, u'搜狐新闻', SOURCE_HOST,
                           item['authorName'], 0, authorURL, type,
                           item['images'])

        if detail(news_id) == 1:
            ids.append(news_id)

    print ('----------------------- insert sohu type:' + str(type) +
           ' count:' + str(len(items)) + '  -----------------------------')

    script.appendIDs(ids)
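
Taken together, the call sites across these examples pin down the rest of the assumed `script` interface (SOURCE_HOST is likewise a global defined elsewhere). A stub with the inferred signatures; the return values are guesses based on how callers check them:

def insert_news(news_id, title, source_name, source_host, author,
                read_count, ico, type, imgs, summary=''):
    # Store a feed entry; Examples 7 and 8 omit `summary`, so it is optional.
    pass

def insert_detail(news_id, title, content, source, time):
    # Store the article body; callers treat a return value of 1 as success.
    return 1

def error(news_id):
    # Record a scrape failure for this id.
    pass

def appendIDs(ids):
    # Persist the batch of successfully stored ids.
    pass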