コード例 #1
0
ファイル: parser.py プロジェクト: jzxyouok/readdaily
def httpParserQiubai(url):
    """Parse the Qiubai (糗事百科) RSS feed at *url* and persist new entries.

    Pulls title/link/description/pubDate items from the feed XML (CDATA
    preserved so embedded HTML survives) and stores the second <title>
    entry (index 1; index 0 is the feed-level title) via RssData().  The
    entry is first checked against the database by link so an unchanged
    feed is not re-saved.

    Parameters:
        url: feed URL, passed straight through to httpRequest().
    """
    content = httpRequest(url)  # fetch raw feed XML
    # strip_cdata=False keeps CDATA sections intact in the parsed tree.
    parser = etree.XMLParser(strip_cdata=False)
    root = etree.XML(content, parser)
    descr = root.xpath(u"//description")
    title = root.xpath(u"//title")
    link = root.xpath(u"//link")
    pubDate = root.xpath(u"//pubDate")
    article = title[0].text  # feed-level <title> names the publication
    counter = 0
    # Original looped t over range(len(title)) keeping only t == 1 with
    # num = 2 - t; iterate that single index directly (empty when the
    # feed has fewer than two <title> elements, matching the original).
    for num in range(1, min(len(title), 2)):
        # NOTE(review): slicing to 24 chars with a comma-no-space format
        # assumes pubDate looks like "Tue,10 Jun 2003 04:00:00"; standard
        # RSS emits "Tue, 10 Jun ..." (see httpParser5, which slices 25
        # chars) -- confirm against the live Qiubai feed.
        newtime = datetime.datetime.strptime(pubDate[num].text[:24],
                                             "%a,%d %b %Y %H:%M:%S")
        newtime = newtime.strftime('%Y年%m月%d日 %H:%M:%S')
        arr = {
            'article': article,
            'title': title[num].text,
            'link': link[num].text,
            'pubdate': newtime,
            'brief': "",
            'descr': descr[num].text,
            'content': "",
        }
        if counter == 0:
            # Skip the save when this link is already stored.
            if not Article.objects.filter(link=link[num].text).exists():
                RssData().keepData(arr)
                counter = 1
        else:
            RssData().keepData(arr)
コード例 #2
0
ファイル: parser.py プロジェクト: jzxyouok/readdaily
def httpParser5(url):
    """Parse an RSS feed whose items carry title/link/description plus
    HTML-escaped <content:encoded> bodies, and persist up to nine entries.

    lxml (CDATA preserved) supplies the plain fields; BeautifulSoup pulls
    the namespaced <content:encoded> tags, which the plain xpath queries
    here do not address.  Entries are processed oldest-first (indices 9
    down to 1); the first one is checked against the database by link so
    an unchanged feed is not re-stored, after which the remainder are
    saved unconditionally.

    Parameters:
        url: feed URL, passed straight through to httpRequest().
    """

    def _unescape(text):
        # The feed escapes the embedded HTML; undo one level of escaping.
        for entity, char in (('&amp;', '&'), ('&lt;', '<'),
                             ('&gt;', '>'), ('&quot;', '"')):
            text = text.replace(entity, char)
        return text

    content = httpRequest(url)  # fetch raw feed XML
    parser = etree.XMLParser(strip_cdata=False)
    root = etree.XML(content, parser)
    soup = BeautifulSoup(content)
    item = soup.findAll('content:encoded')
    descr = root.xpath(u"//description")
    title = root.xpath(u"//title")
    link = root.xpath(u"//link")
    pubDate = root.xpath(u"//pubDate")
    article = title[0].text  # feed-level <title> names the publication
    counter = 0
    # Original looped t over range(len(title)) keeping 1 <= t <= 9 with
    # num = 10 - t; iterate those num values (9 down to 1) directly,
    # bounded the same way by the number of <title> elements.
    for num in range(9, max(10 - len(title), 0), -1):
        string = httpXpath(_unescape(item[num - 1].contents[0]))
        newtime = datetime.datetime.strptime(pubDate[num].text[:25],
                                             "%a, %d %b %Y %H:%M:%S")
        newtime = newtime.strftime('%Y年%m月%d日 %H:%M:%S')
        arr = {
            'article': article,
            'title': title[num].text,
            'link': link[num].text,
            'pubdate': newtime,
            'brief': string,
            'descr': "",
            # NOTE(review): .encode('utf-8') followed by str.replace is
            # Python-2 specific; under Python 3 bytes.replace would need
            # bytes arguments.
            'content': _unescape(item[num - 1].contents[0].encode('utf-8')),
        }
        if counter == 0:
            # Skip the save when this link is already stored.
            if not Article.objects.filter(link=link[num].text).exists():
                RssData().keepData(arr)
                counter = 1
        else:
            RssData().keepData(arr)
コード例 #3
0
ファイル: parser.py プロジェクト: jzxyouok/readdaily
def httpParserGuokr(url):
    """Parse the Guokr (果壳) Atom feed at *url* and persist up to six
    new entries.

    The Atom feed is scanned entirely with BeautifulSoup: <content>,
    <title>, <id> (used as the link) and <updated> tags.  Entries are
    processed oldest-first (indices 6 down to 1); the first one is
    checked against the database by link so an unchanged feed is not
    re-stored, after which the remainder are saved unconditionally.

    Parameters:
        url: feed URL, passed straight through to httpRequest().
    """

    def _unescape(text):
        # The feed escapes the embedded HTML; undo one level of escaping.
        for entity, char in (('&amp;', '&'), ('&lt;', '<'),
                             ('&gt;', '>'), ('&quot;', '"')):
            text = text.replace(entity, char)
        return text

    content = httpRequest(url)  # fetch raw feed XML
    soup = BeautifulSoup(content)
    item = soup.findAll('content')
    title = soup.findAll('title')
    link = soup.findAll('id')      # Atom <id> doubles as the entry link
    pubDate = soup.findAll('updated')
    article = title[0].text  # feed-level <title> names the publication
    counter = 0
    # Original looped t over range(len(title)) keeping 1 <= t <= 6 with
    # num = 7 - t; iterate those num values (6 down to 1) directly,
    # bounded the same way by the number of <title> elements.
    for num in range(6, max(7 - len(title), 0), -1):
        string = httpXpath(_unescape(item[num - 1].contents[0]))
        # Atom <updated> is ISO 8601; only the date part is kept.
        newtime = datetime.datetime.strptime(pubDate[num].contents[0][:10],
                                             "%Y-%m-%d")
        newtime = newtime.strftime('%Y年%m月%d日')
        arr = {
            'article': article,
            'title': title[num].contents[0],
            'link': link[num].contents[0],
            'pubdate': newtime,
            'brief': string,
            # NOTE(review): .encode('utf-8') followed by str.replace is
            # Python-2 specific; under Python 3 bytes.replace would need
            # bytes arguments.
            'descr': _unescape(item[num - 1].contents[0].encode('utf-8')),
            'content': "",
        }
        if counter == 0:
            # Fix: the original filtered on link[num].text while storing
            # link[num].contents[0]; compare the same value that is saved.
            status = Article.objects.filter(
                link=link[num].contents[0]).exists()
            if not status:
                RssData().keepData(arr)
                counter = 1
        else:
            RssData().keepData(arr)