def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    Parameters:
        pageTree: lxml tree of the listing page.
        domain: site domain, used to absolutize relative URLs.
        maxPageURLstoVisit: cap on how many article pages are fetched.

    Returns a dict of parallel lists: articleDescriptions, articleIds,
    articleImages, articlePubDates, articleTitles, articleUrls.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/text()')
    articleUrls = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # get unique id from articleUrl: the value of its "&id=" parameter.
        # NOTE(fix): the previous slice used index('&', ...) and raised
        # ValueError whenever "&id=" was the last query parameter (no
        # trailing '&'); split() handles both cases.
        articleIds.append(articleUrl.split('&id=')[1].split('&')[0])

        if get_article_bodies is True and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions
            articleDescriptions.append(extractArticleBody(articleTree))

            # images
            curArtPubImage = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/a/img[@class="thumb"]/@src')
            articleImages.append(curArtPubImage)

            # timeformat magic from "13/12/2017 22:24:59" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/p[*]/i/b[2]/text()')
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d/%m/%Y %H:%M:%S")
            articlePubDates.append(curArtPubDate)

    articleImages = parsers_common.domainUrls(domain, articleImages)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the list of Tartu treasure adverts.

    Rows whose description does not mention "Tartu" are filtered out
    before returning. Images are never collected, so the images list
    in the returned dict stays empty.
    """
    articlePubDates = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleTitles = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleUrls = parsers_common.domainUrls(domain, pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href'))

    # whole table rows serve as description parents
    rowParents = pageTree.xpath('//div[@id="t-content"]/table[1]/tr')

    articleDescriptions = []
    articleIds = []

    for idx, curUrl in enumerate(articleUrls):
        # unique id = last path component of the article URL
        articleIds.append(curUrl.split('/')[-1])

        # description = serialized children of the matching table row
        articleDescriptions.append(
            parsers_common.stringify_children(rowParents[idx]))

        # timeformat magic from "12.12.2017" to datetime()
        articlePubDates[idx] = parsers_common.rawToDatetime(
            articlePubDates[idx], "%d.%m.%Y")

    # drop every row whose location/description does not mention "Tartu"
    retArticleDescriptions = []
    retArticleIds = []
    retArticleImages = []
    retArticlePubDates = []
    retArticleTitles = []
    retArticleUrls = []

    for idx in range(len(articleUrls)):
        if 'Tartu' not in articleDescriptions[idx]:
            continue
        retArticleDescriptions.append(articleDescriptions[idx])
        retArticleIds.append(articleIds[idx])
        retArticlePubDates.append(articlePubDates[idx])
        retArticleTitles.append(articleTitles[idx])
        retArticleUrls.append(articleUrls[idx])

    return {
        "articleDescriptions": retArticleDescriptions,
        "articleIds": retArticleIds,
        "articleImages": retArticleImages,
        "articlePubDates": retArticlePubDates,
        "articleTitles": retArticleTitles,
        "articleUrls": retArticleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    Visits up to maxPageURLstoVisit article pages to collect descriptions
    and images; publication dates come from the listing page itself.
    Returns a dict of parallel lists.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//ul[@class="list search-items-list"]/li/span/a/text()')
    articleUrls = pageTree.xpath(
        '//ul[@class="list search-items-list"]/li/span/a/@href')
    articlePubDatesRaw = pageTree.xpath(
        '//ul[@class="list search-items-list"]/li/div/span[2]/text()')

    get_article_bodies = True

    for i in range(0, len(articleUrls)):
        articleUrl = articleUrls[i]

        # get unique id from articleUrl (second-to-last path component)
        articleIds.append(articleUrl.split('/')[-2])

        if (get_article_bodies is True and i < maxPageURLstoVisit):
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # description: body text may live under either of two layouts,
            # so both are extracted and joined
            curArtDescParent1 = parsers_common.treeExtract(
                articleTree,
                '//main/article/div[@class="flex flex--align-items-stretch"]//section')  # as a parent
            curArtDescParent2 = parsers_common.treeExtract(
                articleTree,
                '//main/div[@class="wrap"]/div[@class="flex flex--align-items-stretch"]//section')  # as a parent
            curArtDescChilds1 = parsers_common.stringify_children(curArtDescParent1)
            curArtDescChilds2 = parsers_common.stringify_children(curArtDescParent2)
            articleDescriptions.append(curArtDescChilds1 + ' ' + curArtDescChilds2)

            # image: scheme-relative URL, "//" fallback when missing
            curArtImg = parsers_common.treeExtract(
                articleTree,
                '//main/article/div[@class="flex flex--align-items-stretch"]//figure/img[1]/@src') or "//"
            curArtImg = "http:" + curArtImg
            articleImages.append(curArtImg)

            # timeformat magic from "24. detsember 2017 17:51" to datetime().
            # NOTE(fix): the format string literal was broken by an embedded
            # line break in the source; reconstructed as one literal that
            # matches the sample after longMonthsToNumber ("24. 12 2017 17:51").
            curArtPubDate = articlePubDatesRaw[i]
            curArtPubDate = parsers_common.longMonthsToNumber(curArtPubDate)
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d. %m %Y %H:%M")
            articlePubDates.append(curArtPubDate)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    Descriptions, images, titles and URLs come straight from the listing
    page. Per-article date extraction is currently switched off (see the
    todo below), so articlePubDates is returned empty.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/p[1]/text()')
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/a/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/text()')
    articleUrls = parsers_common.domainUrls(domain, pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/@href'))

    # todo(reading times from articles is BROKEN and maybe useless too)
    get_article_bodies = False

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id by hashing the article URL
        articleIds.append(parsers_common.urlToHash(curUrl))

        if get_article_bodies is True and idx < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(curUrl)

            # timeformat magic from "Avaldatud: Neljapäev, 14 Detsember 2017 12:46" to datetime()
            rawDate = parsers_common.treeExtract(
                articleTree, '//span[@class="kakk-postdateicon"]//text()')
            rawDate = parsers_common.longMonthsToNumber(rawDate.split(',')[1])
            articlePubDates.append(
                parsers_common.rawToDatetime(rawDate, "%d %m %Y %H:%M"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    Article bodies are not fetched (the flag below is False), so the
    description and publication-date lists are returned empty.
    """
    articleDescriptions = []
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href')

    get_article_bodies = False

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id by hashing the article URL
        articleIds.append(parsers_common.urlToHash(curUrl))

        if get_article_bodies is True and idx < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(curUrl)

            # first paragraph of the article doubles as its description
            articleDescriptions.append(parsers_common.treeExtract(
                articleTree, '//div[@class="col-sm-9"]/p[1]/strong/text()'))

            # timeformat magic from "Avaldatud: 14 detsember, 2017" to datetime()
            rawDate = parsers_common.treeExtract(
                articleTree,
                '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()')
            rawDate = parsers_common.longMonthsToNumber(rawDate.split(':')[1])
            articlePubDates.append(
                parsers_common.rawToDatetime(rawDate, "%d %m, %Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the offer list of this site from its footable listing.

    Everything is scraped from the listing table itself; no article
    pages are visited and no images are collected.
    """
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/a/strong/text()')
    articleUrls = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/a/@href')
    descNames = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/text()[2]')
    descLocations = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[4]/text()')
    rawDates = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[2]/text()')

    articleDescriptions = []
    articleIds = []

    for idx, curUrl in enumerate(articleUrls):
        # unique id = last dash-separated token of the article URL
        articleIds.append(curUrl.split('-')[-1])

        # description = offer name plus its location, joined with a break
        articleDescriptions.append(
            parsers_common.toPlaintext(descNames[idx]) + "<br>" +
            parsers_common.toPlaintext(descLocations[idx]))

        # timeformat magic from "12.12.2017" to datetime()
        articlePubDates.append(
            parsers_common.rawToDatetime(rawDates[idx], "%d.%m.%Y"))

        # normalize titles to capitalized plain text, in place
        articleTitles[idx] = parsers_common.toPlaintext(
            articleTitles[idx]).capitalize()

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole audio feed of this site.

    The item label text serves both as the title and as the raw date
    source ("15.12.2017 - L"); descriptions are serialized from the item
    bottom block. No images are collected.
    """
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/div[@class="audiolist_item_label"]/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/a/@href')
    descParents = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_bottom"]/div[@class="audioitem_item_desc"]')  # as a parent
    rawDates = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/div[@class="audiolist_item_label"]/text()')

    articleDescriptions = []

    for idx, curUrl in enumerate(articleUrls):
        # derive a stable unique id by hashing the article URL
        articleIds.append(parsers_common.urlToHash(curUrl))

        # description = serialized children of the item's bottom block
        articleDescriptions.append(
            parsers_common.stringify_children(descParents[idx]))

        # timeformat magic from "15.12.2017 - L" to datetime():
        # keep the part before the dash, then normalize and parse
        dayPart = rawDates[idx].split('-')[0]
        dayPart = parsers_common.shortMonthsToNumber(dayPart)
        articlePubDates.append(
            parsers_common.rawToDatetime(dayPart, "%d.%m.%Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    The anchor text is used both as the title and as the base of the
    description; the description is then prefixed with the item's tag
    span. No images are collected.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@class="js-newsline-container"]/div/a/text()')
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="js-newsline-container"]/div/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="js-newsline-container"]/div/a/@href')
    rawDates = pageTree.xpath(
        '//div[@class="js-newsline-container"]/span[1]/text()')
    descTags = pageTree.xpath(
        '//div[@class="js-newsline-container"]/div/span[1]/text()')

    for idx, curUrl in enumerate(articleUrls):
        # unique id = second-to-last path component of the article URL
        articleIds.append(curUrl.split('/')[-2])

        # description = tag span + line break + anchor text, in place
        articleDescriptions[idx] = (
            descTags[idx] + "<br>" + articleDescriptions[idx])

        # timeformat magic from "14 dets 2017 11:34" to datetime()
        curDate = parsers_common.shortMonthsToNumber(rawDates[idx])
        articlePubDates.append(
            parsers_common.rawToDatetime(curDate, "%d %m %Y %H:%M"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    Everything is scraped from the "midColPost" blocks on the listing
    page; no article pages are visited.
    """
    articleDescriptions = pageTree.xpath('//div[@class="midColPost"]/p/text()')
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="midColPost"]/a/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="midColPost"]/h2/a/@title')
    articleUrls = pageTree.xpath('//div[@class="midColPost"]/h2/a/@href')
    rawDates = pageTree.xpath('//div[@class="midColPost"]/span/text()[1]')

    for idx, curUrl in enumerate(articleUrls):
        # unique id = the "?p=" query-parameter value of the article URL
        articleIds.append(curUrl.split("?p=")[1])

        # timeformat magic from "15. detsember 2017 / " to datetime()
        curDate = parsers_common.longMonthsToNumber(rawDates[idx])
        articlePubDates.append(
            parsers_common.rawToDatetime(curDate, "%d. %m %Y /"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """Build the article lists for the whole news feed of this site.

    For the first maxPageURLstoVisit articles the single-article page is
    fetched for the description and an exact date; for the rest, the date
    is assembled from the listing page's day/month/year fragments.
    Returns a dict of parallel lists.
    """
    articleDescriptions = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-excerpt"]/p/text()')
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="news-list-media"]/img/@src')
    articleImages = parsers_common.domainUrls(domain, articleImages)
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)
    articlePubDay = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[1]')
    articlePubMonth = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/span[@class="month"]/text()')
    articlePubYear = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[2]')

    get_article_bodies = True

    for i in range(0, len(articleUrls)):
        articleUrl = articleUrls[i]

        # generate unique id by hashing the article URL
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if (get_article_bodies is True and i < maxPageURLstoVisit):
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions: replace the listing excerpt with the serialized
            # body of the single-article page
            curArtDescParent = parsers_common.treeExtract(
                articleTree,
                '//div[@class="news-single-item"]/div[@class="news-single-content"]')  # as a parent
            articleDescriptions[i] = parsers_common.stringify_children(
                curArtDescParent)

            # timeformat magic from "13 dets 17" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree, '//div[@class="news-single-timedata"]/text()')
            curArtPubDate = parsers_common.shortMonthsToNumber(curArtPubDate)
            articlePubDates.append(
                parsers_common.rawToDatetime(curArtPubDate, "%d %m %y"))
        else:
            # Fall back to the listing-page date fragments; skip rows whose
            # year fragment is missing or implausible (<= 2016).
            # NOTE(fix): this condition was split mid-expression in the
            # source; reconstructed as a single boolean test.
            if i < len(articlePubYear) and int(articlePubYear[i].strip()) > 2016:
                curYear = articlePubYear[i].strip()
                curArtPubDate = (articlePubDay[i].strip() + " " +
                                 articlePubMonth[i].strip() + " " + curYear)
                articlePubDates.append(
                    parsers_common.rawToDatetime(curArtPubDate, "%d %m %Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleImages": articleImages,
        "articleIds": articleIds,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }