# Example n. 1
# 0
def parse_detail(url):
    """Fetch a tingclass article page, extract its metadata and body text,
    and persist it as a 'Reading' object unless it already exists.

    Pages embedding a third-party video player or flash content are
    skipped entirely: the function returns False for them, otherwise None.

    Reads/writes the module globals declared below (source_name, category,
    type, category_2, type_name, typeId) that the caller sets up.
    """
    global source_name
    global category
    global type
    global item_id
    global category_2
    global type_name
    global typeId
    type = 'text'
    title = ''
    contents = ''
    img_url = ''
    img_urls = []
    media_url = ''
    lrc_url = ''
    # Sentinel publish time used when no date can be parsed from the page.
    publish_time = datetime.strptime('2019-05-01 08:00:00',
                                     "%Y-%m-%d %H:%M:%S")
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    # Skip pages that embed an external video player or flash object.
    if 'video.qq' in req.text or 'v.qq.com' in req.text or 'player.youku.com' in req.text \
            or 'player.video.qiyi.com' in req.text or 'player.pps.tv' in req.text or '.swf' in req.text:
        return False
    soup = BeautifulSoup(req.text, "html5lib")
    try:
        try:
            # Dates appear as u'YYYY年MM月DD日'; rewrite to 'YYYY-MM-DD'.
            time_tag = soup.find_all('p', class_='fl p-left ml10')
            for ttag in time_tag:
                temp = ttag.text.strip()
                if u'年' in temp:
                    temp = temp.replace(u'年', '-')
                    temp = temp.replace(u'月', '-')
                    temp = temp.replace(u'日', '')
                    publish_time = datetime.strptime(temp, "%Y-%m-%d")
        except Exception:
            # Unparsable date: fall back to a fixed sentinel time.
            publish_time = datetime.strptime('2019-05-01 09:00:00',
                                             "%Y-%m-%d %H:%M:%S")

        try:
            # Look for a direct .mp4 media link first; fall back to .f4v.
            mp4str = re.findall(r'http.*?\.mp4', req.text)
            if mp4str:
                media_url = mp4str[0]
                type = 'video'
            if len(media_url) == 0:
                mp4str = re.findall(r'http.*?\.f4v', req.text)
                if mp4str:
                    media_url = mp4str[0]
                    type = 'video'
        except Exception:
            pass

        div_tag = soup.select('div.tit-class-con h1')
        if div_tag:
            title = div_tag[0].text.strip()

        # A dedicated mp3 div overrides any video URL found above.
        mp3_tag = soup.find('div', id='mp3')
        if mp3_tag:
            media_url = mp3_tag.text.strip()
            type = 'mp3'

        content_tag = soup.find('div', class_='arti-con rel')
        if content_tag:
            contents = contentUtil.getTingclassContent(
                content_tag.text.strip())

        # Collect article images hosted on the site CDNs, skipping the
        # static 2014 theme assets; the last match becomes img_url.
        img_tags = soup.select('div.arti-con.rel img')
        for imgTag in img_tags:
            if imgTag.has_attr('src'):
                src = imgTag['src']
                if ('n1image.hjfile.cn' in src or 'tingclass' in src):
                    if 'statics/images/2014' not in src:
                        img_url = src
                        img_urls.append(src)

        if is_exit(title, url):
            print('item exit')
        else:
            print(title)
            print(img_url)
            print(img_urls)
            print(lrc_url)
            print(publish_time)
            print(media_url)
            print(source_name)
            print(category)
            print(type)
            print(category_2)
            print(type_name)
            print(contents)
            Composition = Object.extend('Reading')
            mComposition = Composition()
            mComposition.set('title', title)
            mComposition.set('img_url', img_url)
            mComposition.set('img_urls', img_urls)
            mComposition.set('img_type', 'url')
            mComposition.set('content', contents)
            mComposition.set('type_name', type_name)
            mComposition.set('publish_time', publish_time)
            mComposition.set('source_url', url)
            mComposition.set('type_id', typeId)
            mComposition.set('source_name', source_name)
            mComposition.set('category', category)
            mComposition.set('category_2', category_2)
            mComposition.set('lrc_url', lrc_url)
            mComposition.set('type', type)
            mComposition.set('media_url', media_url)
            mComposition.save()
            print('save item')
    except Exception:
        print(traceback.format_exc())
        print(url)
        return
# Example n. 2
# 0
def parse_detail(url, img_url):
    """Fetch an article page, extract title/date/audio/body/images, and
    persist it as a 'Reading' object unless it already exists.

    :param url: detail-page URL to scrape.
    :param img_url: listing-page thumbnail; stored as the primary image
        and also placed first in img_urls.

    Reads/writes the module globals declared below (source_name, category,
    type, category_2, type_name) that the caller sets up.
    """
    global source_name
    global category
    global type
    global item_id
    global category_2
    global type_name
    type = 'text'
    title = ''
    contents = ''
    img_urls = []
    media_url = ''
    lrc_url = ''
    # Sentinel publish time used when no date can be parsed from the page.
    publish_time = datetime.strptime('2019-05-01 08:00:00', "%Y-%m-%d %H:%M:%S")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, "html5lib")
    try:
        try:
            time_tag = soup.find('p', class_='main_title3')
            if time_tag:
                timeStr = time_tag.text.strip()
                # Drop any leading label: keep from the first digit on.
                # A non-match makes result None and .group() raise, which
                # the except below turns into the fallback sentinel.
                result = re.search(r'\d+.+', timeStr)
                timeStr = result.group()
                publish_time = datetime.strptime(timeStr, "%Y-%m-%d %H:%M")
        except Exception:
            publish_time = datetime.strptime('2019-05-01 09:00:00', "%Y-%m-%d %H:%M:%S")

        mp3_tag = soup.find('audio')
        if mp3_tag:
            # The audio src may be relative; resolve it against the page URL.
            media_url = urlparse.urljoin(url, mp3_tag['src'])
            type = 'mp3'

        title_tag = soup.find('span', class_='main_title1')
        if title_tag:
            title = title_tag.text.strip()
        content_tag = soup.find('div', id='Content')
        if content_tag:
            contents = contentUtil.getTingclassContent(content_tag.text.strip())

        img_urls.append(img_url)
        # Guard: pages without a Content div used to raise AttributeError
        # here, which the outer handler silently swallowed, skipping the save.
        if content_tag:
            for imgTag in content_tag.find_all('img'):
                if imgTag.has_attr('src'):
                    img_urls.append(urlparse.urljoin(url, imgTag['src']))

        if not is_exit(url):
            Composition = Object.extend('Reading')
            mComposition = Composition()
            mComposition.set('title', title)
            mComposition.set('img_url', img_url)
            mComposition.set('img_urls', img_urls)
            mComposition.set('img_type', 'url')
            mComposition.set('content', contents)
            mComposition.set('type_name', type_name)
            mComposition.set('publish_time', publish_time)
            mComposition.set('type_id', '')
            mComposition.set('source_url', url)
            mComposition.set('source_name', source_name)
            mComposition.set('category', category)
            mComposition.set('category_2', category_2)
            mComposition.set('lrc_url', lrc_url)
            mComposition.set('type', type)
            mComposition.set('media_url', media_url)
            mComposition.save()
    except Exception:
        return