def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the per-article metadata lists for this news site.

    Args:
        pageTree: lxml HTML tree of the article-listing page.
        domain: site domain, used to absolutize relative URLs.
        maxPageURLstoVisit: max number of individual article pages to fetch.

    Returns:
        dict of parallel lists: descriptions, ids, images, pub dates,
        titles and urls.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []

    articleTitles = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/text()')
    articleUrls = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # get unique id from articleUrl: the value of the "&id=" parameter.
        # split() also handles an id that is the *last* parameter (no
        # trailing '&'), where the old index()-based slicing raised ValueError.
        idTail = articleUrl.split('&id=', 1)[1]
        articleIds.append(idTail.split('&', 1)[0])

        if get_article_bodies and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions
            articleDescriptions.append(extractArticleBody(articleTree))

            # images
            curArtPubImage = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/a/img[@class="thumb"]/@src')
            articleImages.append(curArtPubImage)

            # timeformat magic from "13/12/2017 22:24:59" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/p[*]/i/b[2]/text()')
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d/%m/%Y %H:%M:%S")
            articlePubDates.append(curArtPubDate)

    articleImages = parsers_common.domainUrls(domain, articleImages)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the per-article metadata lists for this news site.

    Args:
        pageTree: lxml HTML tree of the search/listing page.
        domain: site domain (unused here; URLs on this site are absolute).
        maxPageURLstoVisit: max number of individual article pages to fetch.

    Returns:
        dict of parallel lists: descriptions, ids, images, pub dates,
        titles and urls.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []

    articleTitles = pageTree.xpath('//ul[@class="list search-items-list"]/li/span/a/text()')
    articleUrls = pageTree.xpath('//ul[@class="list search-items-list"]/li/span/a/@href')
    articlePubDatesRaw = pageTree.xpath('//ul[@class="list search-items-list"]/li/div/span[2]/text()')

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # get unique id from articleUrl (second-to-last path segment)
        articleIds.append(articleUrl.split('/')[-2])

        if get_article_bodies and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # description: the article body may live under either of two
            # layouts, so both sections are extracted and concatenated
            curArtDescParent1 = parsers_common.treeExtract(
                articleTree,
                '//main/article/div[@class="flex flex--align-items-stretch"]//section')  # as a parent
            curArtDescParent2 = parsers_common.treeExtract(
                articleTree,
                '//main/div[@class="wrap"]/div[@class="flex flex--align-items-stretch"]//section')  # as a parent
            curArtDescChilds1 = parsers_common.stringify_children(curArtDescParent1)
            curArtDescChilds2 = parsers_common.stringify_children(curArtDescParent2)
            articleDescriptions.append(curArtDescChilds1 + ' ' + curArtDescChilds2)

            # image (protocol-relative src; "//" fallback keeps the
            # "http:" prefix harmless when no image is found)
            curArtImg = parsers_common.treeExtract(
                articleTree,
                '//main/article/div[@class="flex flex--align-items-stretch"]//figure/img[1]/@src') or "//"
            curArtImg = "http:" + curArtImg
            articleImages.append(curArtImg)

            # timeformat magic from "24. detsember 2017 17:51" to datetime().
            # NB: the format string was previously broken in two by a raw
            # line break (a syntax error); "%d. %m %Y %H:%M" is the correct
            # single-line format after longMonthsToNumber().
            curArtPubDate = articlePubDatesRaw[i]
            curArtPubDate = parsers_common.longMonthsToNumber(curArtPubDate)
            curArtPubDate = parsers_common.rawToDatetime(curArtPubDate, "%d. %m %Y %H:%M")
            articlePubDates.append(curArtPubDate)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the per-article metadata lists for this news site.

    Everything comes straight from the listing page; article bodies are
    currently not fetched (see the todo), so publication dates stay empty.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/p[1]/text()')
    articleImages = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/a/img/@src')
    articleTitles = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/text()')
    articleUrls = parsers_common.domainUrls(
        domain,
        pageTree.xpath(
            '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/@href'))

    articleIds = []
    articlePubDates = []

    # todo(reading times from articles is BROKEN and maybe useless too)
    get_article_bodies = False

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id from the URL itself
        articleIds.append(parsers_common.urlToHash(curUrl))

        if not (get_article_bodies is True and idx < maxPageURLstoVisit):
            continue

        # load article into tree
        articleTree = makereq.getArticleData(curUrl)

        # timeformat magic from "Avaldatud: Neljapäev, 14 Detsember 2017 12:46" to datetime()
        rawDate = parsers_common.treeExtract(
            articleTree, '//span[@class="kakk-postdateicon"]//text()')
        rawDate = parsers_common.longMonthsToNumber(rawDate.split(',')[1])
        articlePubDates.append(
            parsers_common.rawToDatetime(rawDate, "%d %m %Y %H:%M"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the per-article metadata lists for this news site.

    Only the listing page is scraped; fetching each article's body (for
    description and publication date) is currently switched off.
    """
    articleImages = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src')
    articleTitles = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href')

    articleDescriptions = []
    articleIds = []
    articlePubDates = []

    get_article_bodies = False

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id from the URL itself
        articleIds.append(parsers_common.urlToHash(curUrl))

        if not (get_article_bodies is True and idx < maxPageURLstoVisit):
            continue

        # load article into tree
        articleTree = makereq.getArticleData(curUrl)

        # first bold paragraph doubles as the description
        articleDescriptions.append(parsers_common.treeExtract(
            articleTree, '//div[@class="col-sm-9"]/p[1]/strong/text()'))

        # timeformat magic from "Avaldatud: 14 detsember, 2017" to datetime()
        rawDate = parsers_common.treeExtract(
            articleTree,
            '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()')
        rawDate = parsers_common.longMonthsToNumber(rawDate.split(':')[1])
        articlePubDates.append(
            parsers_common.rawToDatetime(rawDate, "%d %m, %Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
# Kept for backward compatibility in case later code references it.
RSStoGenerate = range(0, len(RSSdefinitions))

# generate all feeds
for curDef in RSSdefinitions:
    curName = curDef[0]
    curTitle = curDef[1]
    curDescription = curDef[2]
    # normalize the site domain to exactly one trailing slash
    curDomain = curDomainRSS = curDef[3].rstrip('/') + '/'
    # Optional 5th field: a separate listing URL to scrape instead of the
    # domain root. NB: index 4 exists only when len > 4 — the previous
    # "len > 3" check raised IndexError on 4-element definitions.
    if len(curDef) > 4 and len(curDef[4]) > 0:
        curDomainRSS = curDef[4]

    # each feed has a parser module named "parser_<name>"
    curParser = sys.modules['parser_' + curName]
    curFilename = curName + '.rss'

    try:
        # load page into tree
        pageTree = makereq.getArticleData(curDomainRSS)
    except Exception:
        print('rss_generaator: Viga! Ei suutnud andmeid pärida leheküljelt: ' + curDomainRSS)
        continue

    dataset = curParser.getArticleListsFromHtml(pageTree, curDomain, maxArticleURLstoVisit)
    rss = rssmaker.rssmaker(dataset, curTitle, curDomain, curDomainRSS, curDescription)

    try:
        # "with" guarantees the file handle is closed even if write() fails
        # (the old code leaked the handle returned by open()).
        with open(curFilename, 'wb') as rssFile:
            rss.write(rssFile, encoding='UTF-8', pretty_print=True)
        print('rss_generaator: Fail ' + curFilename + ' salvestatud.')
    except Exception:
        print('rss_generaator: Viga! Ei õnnestunud faili ' + curFilename + ' salvestada.')
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the per-article metadata lists for this news site.

    Listing-page excerpts and date fragments are used as fallbacks; for
    the first maxPageURLstoVisit articles the full article page is fetched
    for a richer description and an exact publication time.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-excerpt"]/p/text()')
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="news-list-media"]/img/@src')
    articleImages = parsers_common.domainUrls(domain, articleImages)
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    # date fragments shown on the listing page (used when the article
    # body is not fetched)
    articlePubDay = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[1]')
    articlePubMonth = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/span[@class="month"]/text()')
    articlePubYear = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[2]')

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # generate unique id from articleUrl
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if get_article_bodies and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions: replace the listing excerpt with the full body
            curArtDescParent = parsers_common.treeExtract(
                articleTree,
                '//div[@class="news-single-item"]/div[@class="news-single-content"]')  # as a parent
            curArtDescChilds = parsers_common.stringify_children(curArtDescParent)
            # guard: the excerpt list can be shorter than the URL list, so
            # extend instead of indexing past the end (old code raised
            # IndexError in that case)
            if i < len(articleDescriptions):
                articleDescriptions[i] = curArtDescChilds
            else:
                articleDescriptions.append(curArtDescChilds)

            # timeformat magic from "13 dets 17" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree, '//div[@class="news-single-timedata"]/text()')
            curArtPubDate = parsers_common.shortMonthsToNumber(curArtPubDate)
            curArtPubDate = parsers_common.rawToDatetime(curArtPubDate, "%d %m %y")
            articlePubDates.append(curArtPubDate)
        else:
            # fall back to the listing-page date, but only for plausible
            # years (site shows stale/placeholder dates before 2017)
            if i < len(articlePubYear) and (int(articlePubYear[i].strip()) > 2016):
                curYear = articlePubYear[i].strip()
                curArtPubDate = (articlePubDay[i].strip() + " " +
                                 articlePubMonth[i].strip() + " " + curYear)
                curArtPubDate = parsers_common.rawToDatetime(curArtPubDate, "%d %m %Y")
                articlePubDates.append(curArtPubDate)

    return {
        "articleDescriptions": articleDescriptions,
        "articleImages": articleImages,
        "articleIds": articleIds,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }