def httpParserQiubai(url):
    """Qiubai RSS feed: extract title / link / description and persist new items.

    Fetches the feed at *url*, pulls the RSS fields with lxml xpath, and
    stores item index 1 (index 0 is the channel-level element) through
    ``RssData().keepData`` unless its link is already in ``Article``.
    """
    raw = httpRequest(url)  # fetch the raw feed body
    root = etree.XML(raw, etree.XMLParser(strip_cdata=False))

    descrs = root.xpath(u"//description")
    titles = root.xpath(u"//title")
    links = root.xpath(u"//link")
    pub_dates = root.xpath(u"//pubDate")

    feed_name = titles[0].text  # channel title, attached to every record
    stored_new = False

    for idx in range(len(titles)):
        # Only item index 1 is taken (idx 0 is the channel element).
        if idx == 0 or idx > 1:
            continue
        pos = 2 - idx

        # NOTE(review): format has no space after the comma and [:24] trims
        # one char more than the sibling parsers — presumably matches this
        # particular feed's pubDate layout; confirm against a live feed.
        when = datetime.datetime.strptime(pub_dates[pos].text[:24],
                                          "%a,%d %b %Y %H:%M:%S")

        record = {
            'article': feed_name,
            'title': titles[pos].text,
            'link': links[pos].text,
            'pubdate': when.strftime('%Y年%m月%d日 %H:%M:%S'),
            'brief': "",
            'descr': descrs[pos].text,
            'content': "",
        }

        if stored_new:
            # A new item was already found: store the rest unconditionally.
            RssData().keepData(record)
        else:
            # Still deduping: store only if this link is not yet in the DB,
            # then stop checking for the remaining items.
            if not Article.objects.filter(link=links[pos].text).exists():
                RssData().keepData(record)
                stored_new = True
def httpParser5(url):
    """Generic RSS parser for feeds with title / link / description plus a
    full-article ``content:encoded`` payload; persists the 9 newest items.

    The feed is fetched once and parsed twice: lxml xpath for the plain RSS
    fields, BeautifulSoup for the namespaced ``content:encoded`` nodes.
    Items are walked oldest-first; once one item is found that is not yet in
    ``Article``, all newer items are stored unconditionally.
    """
    def _unescape(text):
        # NOTE(review): the original chain was .replace('&', '&'),
        # .replace('<', '<'), ... — self-replacements that do nothing.
        # The intent was clearly to undo HTML entity escaping of the
        # embedded markup; that is restored here.
        return (text.replace('&amp;', '&').replace('&lt;', '<')
                    .replace('&gt;', '>').replace('&quot;', '"'))

    body = httpRequest(url)  # fetch the raw feed body
    root = etree.XML(body, etree.XMLParser(strip_cdata=False))
    soup = BeautifulSoup(body)
    encoded_items = soup.findAll('content:encoded')

    descrs = root.xpath(u"//description")  # kept for parity; 'descr' stays ""
    titles = root.xpath(u"//title")
    links = root.xpath(u"//link")
    pub_dates = root.xpath(u"//pubDate")

    feed_name = titles[0].text  # channel title, attached to every record
    stored_new = False

    for idx in range(len(titles)):
        # Take items 1..9 (idx 0 is the channel element).
        if idx == 0 or idx > 9:
            continue
        pos = 10 - idx  # walk oldest-first so the dedup check stops early

        # content:encoded nodes have no channel-level entry, hence pos - 1.
        full_text = _unescape(encoded_items[pos - 1].contents[0])
        brief = httpXpath(full_text)

        when = datetime.datetime.strptime(pub_dates[pos].text[:25],
                                          "%a, %d %b %Y %H:%M:%S")

        record = {
            'article': feed_name,
            'title': titles[pos].text,
            'link': links[pos].text,
            'pubdate': when.strftime('%Y年%m月%d日 %H:%M:%S'),
            'brief': brief,
            'descr': "",
            # Unescape BEFORE encoding: the original encoded first and then
            # called .replace with str arguments, which is a TypeError on
            # bytes under Python 3 (and the replaces were no-ops anyway).
            'content': full_text.encode('utf-8'),
        }

        if stored_new:
            # A new item was already found: store the rest unconditionally.
            RssData().keepData(record)
        else:
            # Still deduping: store only if this link is not yet in the DB,
            # then stop checking for the remaining (newer) items.
            if not Article.objects.filter(link=links[pos].text).exists():
                RssData().keepData(record)
                stored_new = True
def httpParserGuokr(url):
    """Guokr Atom feed: extract content / title / id / updated and persist
    the 6 newest items.

    Parsed entirely with BeautifulSoup (Atom tags: ``content``, ``title``,
    ``id`` as the link, ``updated`` as the date).  Items are walked
    oldest-first; once one item is found that is not yet in ``Article``,
    all newer items are stored unconditionally.
    """
    def _unescape(text):
        # NOTE(review): the original chain was .replace('&', '&'),
        # .replace('<', '<'), ... — self-replacements that do nothing.
        # The intent was clearly to undo HTML entity escaping of the
        # embedded markup; that is restored here.
        return (text.replace('&amp;', '&').replace('&lt;', '<')
                    .replace('&gt;', '>').replace('&quot;', '"'))

    body = httpRequest(url)  # fetch the raw feed body
    soup = BeautifulSoup(body)

    contents = soup.findAll('content')
    titles = soup.findAll('title')
    links = soup.findAll('id')        # Atom <id> doubles as the item link
    pub_dates = soup.findAll('updated')

    feed_name = titles[0].text  # channel title, attached to every record
    stored_new = False

    for idx in range(len(titles)):
        # Take items 1..6 (idx 0 is the channel element).
        if idx == 0 or idx > 6:
            continue
        pos = 7 - idx  # walk oldest-first so the dedup check stops early

        # <content> nodes have no channel-level entry, hence pos - 1.
        full_text = _unescape(contents[pos - 1].contents[0])
        brief = httpXpath(full_text)

        # Only the date part of the ISO timestamp is used.
        when = datetime.datetime.strptime(pub_dates[pos].contents[0][:10],
                                          "%Y-%m-%d")

        record = {
            'article': feed_name,
            'title': titles[pos].contents[0],
            'link': links[pos].contents[0],
            'pubdate': when.strftime('%Y年%m月%d日'),
            'brief': brief,
            # Unescape BEFORE encoding: the original encoded first and then
            # called .replace with str arguments, which is a TypeError on
            # bytes under Python 3 (and the replaces were no-ops anyway).
            'descr': full_text.encode('utf-8'),
            'content': "",
        }

        if stored_new:
            # A new item was already found: store the rest unconditionally.
            RssData().keepData(record)
        else:
            # Still deduping: store only if this link is not yet in the DB,
            # then stop checking for the remaining (newer) items.
            if not Article.objects.filter(link=links[pos].text).exists():
                RssData().keepData(record)
                stored_new = True