def parse_detail(url, timeout=30):
    """Scrape one article page and store it as a ``Reading`` record.

    The page is fetched, then title, publish date, body text, images and an
    optional media URL (mp4/f4v video or mp3 audio) are extracted.  Unless
    ``is_exit(title, url)`` is truthy (presumably "already stored" -- confirm
    against its definition), the fields are printed and saved through
    ``Object.extend('Reading')``.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the HTTP response.  The original code
            had no timeout, so a stalled server blocked the crawl forever.

    Returns:
        ``False`` when the page embeds a third-party video/Flash player
        (such pages are skipped); ``None`` in every other case, including
        when parsing or saving failed and the error was only printed.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); the
        request is deliberately outside the error-swallowing ``try``.

    Side effects:
        Overwrites the module-level global ``type`` with ``'text'``,
        ``'video'`` or ``'mp3'``.  Reads the module-level globals
        ``source_name``, ``category``, ``category_2``, ``type_name`` and
        ``typeId``.
    """
    # Only ``type`` is assigned here, so it is the only name that needs a
    # ``global`` declaration.  The name shadows the builtin, but it is part
    # of the module's shared state and cannot be renamed from this function.
    global type

    type = 'text'
    title = ''
    contents = ''
    img_url = ''
    img_urls = []
    media_url = ''
    lrc_url = ''
    # Placeholder date used when the page carries no date at all.
    publish_time = datetime.strptime('2019-05-01 08:00:00', "%Y-%m-%d %H:%M:%S")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }

    req = requests.get(url, headers=headers, timeout=timeout)
    req.encoding = 'utf-8'
    page = req.text

    # Pages that rely on an external/Flash player are not stored.
    embedded_players = (
        'video.qq',
        'v.qq.com',
        'player.youku.com',
        'player.video.qiyi.com',
        'player.pps.tv',
        '.swf',
    )
    if any(marker in page for marker in embedded_players):
        return False

    soup = BeautifulSoup(page, "html5lib")
    try:
        # Dates look like "2019年5月1日"; rewrite them to "2019-5-1".
        try:
            for time_tag in soup.find_all('p', class_='fl p-left ml10'):
                date_text = time_tag.text.strip()
                if u'年' in date_text:
                    date_text = (date_text.replace(u'年', '-')
                                 .replace(u'月', '-')
                                 .replace(u'日', ''))
                    publish_time = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            # Unparseable date: use the "date present but malformed" placeholder.
            publish_time = datetime.strptime('2019-05-01 09:00:00', "%Y-%m-%d %H:%M:%S")

        # First direct video link in the raw HTML wins; mp4 is preferred
        # over f4v.
        for pattern in (r'http.*?\.mp4', r'http.*?\.f4v'):
            match = re.search(pattern, page)
            if match:
                media_url = match.group(0)
                type = 'video'
                break

        title_tags = soup.select('div.tit-class-con h1')
        if title_tags:
            title = title_tags[0].text.strip()

        # An mp3 link, when present, takes precedence over any video link.
        mp3_tag = soup.find('div', id='mp3')
        if mp3_tag:
            media_url = mp3_tag.text.strip()
            type = 'mp3'

        content_tag = soup.find('div', class_='arti-con rel')
        if content_tag:
            contents = contentUtil.getTingclassContent(content_tag.text.strip())

        # Keep only images from the known hosts, excluding the 2014 static
        # images; the last kept image becomes the cover (``img_url``).
        for img_tag in soup.select('div.arti-con.rel img'):
            if not img_tag.has_attr('src'):
                continue
            src = img_tag['src']
            from_known_host = 'n1image.hjfile.cn' in src or 'tingclass' in src
            if from_known_host and 'statics/images/2014' not in src:
                img_url = src
                img_urls.append(src)

        if is_exit(title, url):
            print('item exit')
        else:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement and also parses under Python 3.
            print(title)
            print(img_url)
            print(img_urls)
            print(lrc_url)
            print(publish_time)
            print(media_url)
            print(source_name)
            print(category)
            print(type)
            print(category_2)
            print(type_name)
            print(contents)

            Composition = Object.extend('Reading')
            record = Composition()
            record.set('title', title)
            record.set('img_url', img_url)
            record.set('img_urls', img_urls)
            record.set('img_type', 'url')
            record.set('content', contents)
            record.set('type_name', type_name)
            record.set('publish_time', publish_time)
            record.set('source_url', url)
            record.set('type_id', typeId)
            record.set('source_name', source_name)
            record.set('category', category)
            record.set('category_2', category_2)
            record.set('lrc_url', lrc_url)
            record.set('type', type)
            record.set('media_url', media_url)
            record.save()
            print('save item')
    except Exception:
        # Best effort: one broken page must not stop the crawl, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print(traceback.format_exc())
        print(url)
    return
def parse_detail(url, img_url, timeout=30):
    """Scrape one article page and store it as a ``Reading`` record.

    The page is fetched, then title, publish time, body text, images and an
    optional ``<audio>`` source are extracted.  Unless ``is_exit(url)`` is
    truthy (presumably "already stored" -- confirm against its definition),
    the fields are saved through ``Object.extend('Reading')``.

    NOTE(review): the original source was collapsed onto a single line, so
    the nesting below is reconstructed; in particular the ``is_exit`` check
    is assumed to be live code and ``img_urls.append(img_url)`` is assumed
    to sit inside the ``if content_tag`` branch -- confirm.

    Args:
        url: Address of the article page; also the base for resolving
            relative media and image links.
        img_url: Cover image URL supplied by the caller; stored as
            ``img_url`` and as the first entry of ``img_urls``.
        timeout: Seconds to wait for the HTTP response.  The original code
            had no timeout, so a stalled server blocked the crawl forever.

    Returns:
        Always ``None``.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); the
        request is deliberately outside the error-swallowing ``try``.

    Side effects:
        Overwrites the module-level global ``type`` with ``'text'`` or
        ``'mp3'``.  Reads the module-level globals ``source_name``,
        ``category``, ``category_2`` and ``type_name``.
    """
    # Only ``type`` is assigned here, so it is the only name that needs a
    # ``global`` declaration.  The name shadows the builtin, but it is part
    # of the module's shared state and cannot be renamed from this function.
    global type

    type = 'text'
    title = ''
    contents = ''
    img_urls = []
    media_url = ''
    lrc_url = ''
    # Placeholder date used when the page carries no date at all.
    publish_time = datetime.strptime('2019-05-01 08:00:00', "%Y-%m-%d %H:%M:%S")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }

    req = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(req.text, "html5lib")
    try:
        try:
            time_tag = soup.find('p', class_='main_title3')
            if time_tag:
                # Drop any text before the first digit, then parse the rest.
                time_str = re.search(r'\d+.+', time_tag.text.strip()).group()
                publish_time = datetime.strptime(time_str, "%Y-%m-%d %H:%M")
        except (AttributeError, ValueError):
            # AttributeError: no digits found (``re.search`` returned None);
            # ValueError: text did not match the expected format.
            publish_time = datetime.strptime('2019-05-01 09:00:00', "%Y-%m-%d %H:%M:%S")

        audio_tag = soup.find('audio')
        if audio_tag:
            media_url = urlparse.urljoin(url, audio_tag['src'])
            type = 'mp3'

        title_tag = soup.find('span', class_='main_title1')
        if title_tag:
            title = title_tag.text.strip()

        content_tag = soup.find('div', id='Content')
        if content_tag:
            contents = contentUtil.getTingclassContent(content_tag.text.strip())
            # The caller-supplied cover always comes first in the gallery.
            img_urls.append(img_url)
            for img_tag in content_tag.find_all('img'):
                if img_tag.has_attr('src'):
                    img_urls.append(urlparse.urljoin(url, img_tag['src']))

        if not is_exit(url):
            Composition = Object.extend('Reading')
            record = Composition()
            record.set('title', title)
            record.set('img_url', img_url)
            record.set('img_urls', img_urls)
            record.set('img_type', 'url')
            record.set('content', contents)
            record.set('type_name', type_name)
            record.set('publish_time', publish_time)
            record.set('type_id', '')
            record.set('source_url', url)
            record.set('source_name', source_name)
            record.set('category', category)
            record.set('category_2', category_2)
            record.set('lrc_url', lrc_url)
            record.set('type', type)
            record.set('media_url', media_url)
            record.save()
    except Exception:
        # Best effort: one broken page must not stop the crawl.  The failure
        # is reported on stderr (stdout stays quiet, as before) instead of
        # being dropped without a trace, and KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        import sys
        import traceback
        sys.stderr.write('parse_detail failed for %r\n' % (url,))
        traceback.print_exc()
        return