def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '//div[@id="t-content"]/table[1]/tr', parent=True) articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[1]/text()') articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href') # remove unwanted content: descriptions dictList = ["Tartu"] articleDataDict = parsers_common.article_data_dict_clean( articleDataDict, dictList, "not in", "descriptions") for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "29.08.19" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d.%m.%y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/p', parent=True) articleDataDict["images"] = parsers_common.xpath_to( "list", pageTree, '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/p/img/@src' ) articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@title' ) articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/h3/text()' ) articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@href' ) for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "03.01.2018 11:09.08 [Tanel]" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = curArtPubDate.split('[')[0] curArtPubDate = parsers_datetime.months_to_int(curArtPubDate) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d. %m %Y %H:%M:%S") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["authors"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="message"]/div[@class="name"]', parent=True) articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="message"]/div[@class="content"]', parent=True) articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="message"]/div[@class="posttime"]/text()') articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="message"]/div[@class="title"]/a[3]/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="message"]/div[@class="title"]/a[3]/@href') # remove unwanted content: titles dictList = [ "Hoiatus! Läbisõit 100% keritud", "Kaebused", ] articleDataDict = parsers_common.article_data_dict_clean( articleDataDict, dictList, "in", "titles") for i in parsers_common.article_urls_range(articleDataDict["urls"]): # description curArtDesc = parsers_common.get(articleDataDict["descriptions"], i) curArtDesc = curArtDesc.split('<div class="userControls')[0] articleDataDict["descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # title curArtTitle = parsers_common.get(articleDataDict["titles"], i) curArtTitle = parsers_common.str_lchop(curArtTitle, "Re: ") curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain) articleDataDict["titles"] = parsers_common.list_add_or_assign( articleDataDict["titles"], i, curArtTitle) # pubDates magic from "20:22 01.09.2019" to Datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.replace_string_with_timeformat( curArtPubDate, "eile", "%d.%m.%Y", offsetDays=-1) curArtPubDate = parsers_datetime.replace_string_with_timeformat( curArtPubDate, "täna", "%d.%m.%Y", offsetDays=0) curArtPubDate = parsers_datetime.add_missing_date_to_string( curArtPubDate, "%H:%M %d.%m.%Y", " %d.%m.%Y") curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%H:%M %d.%m.%Y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//body/div[2]/div[1]/main/div/div[3]/div/div/a/h2/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//body/div[2]/div[1]/main/div/div[3]/div/div/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # description curArtDesc = parsers_common.xpath_to("single", pageTree, '//div[@class="col-md-12"]', parent=True) if not curArtDesc: curArtDesc = parsers_common.xpath_to( "single", pageTree, '//div[@class="col-md-8"]', parent=True) if not curArtDesc: curArtDesc = parsers_common.xpath_to( "single", pageTree, '//div[@class="img-open-area basic-content pb-4"]', parent=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # image curArtImg = parsers_common.xpath_to( "single", pageTree, '//a[@data-lightbox="treimages"][1]/@href') articleDataDict["images"] = parsers_common.list_add_or_assign( articleDataDict["images"], i, curArtImg) # pubDates magic from "7. märts 2021" to datetime() curArtPubDate = parsers_common.xpath_to( "single", pageTree, '//div[@class="col-sm-12 col-md-auto text-sm-center text-md-right"]/div[@class="mt-1 text-uppercase"]/small/text()' ) curArtPubDate = parsers_datetime.months_to_int(curArtPubDate) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d. %m %Y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-prolog"]', parent=True) articleDataDict["images"] = parsers_common.xpath_to( "list", pageTree, '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-image"]/@style' ) articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-title"]/h3/text()' ) articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # author curArtAuthor = parsers_common.xpath_to( "single", pageTree, '//div[@class="sg-article-details"]/div[@class="author"]/text()' ) articleDataDict["authors"] = parsers_common.list_add_or_assign( articleDataDict["authors"], i, curArtAuthor) # description curArtDesc = parsers_common.xpath_to( "single", pageTree, '/html/body/div[3]/div/div[@class="page-content"]/div[@class="sg-article"]/div[@class="sg-article-text"]', parent=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # pubDates magic from "18.08.2019 21:35" to datetime() curArtPubDate = parsers_common.xpath_to( "single", pageTree, '//div[@class="sg-article-details"]/div[@class="date"]/text()') curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d.%m.%Y %H:%M") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/p', parent=True) articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="right-content"]/div[@class="application-date"][1]/text()' ) articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/text()' ) articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/@href' ) # remove unwanted content: descriptions dictList = [ "jurist", "logopeed", "pedagoog", "psühholoog", "raamatupidaja", "sotsiaal", "õpetaja", ] articleDataDict = parsers_common.article_data_dict_clean( articleDataDict, dictList, "in", "titles") for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "Avaldatud: 12.12.2017" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = curArtPubDate.split(': ')[1] curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d.%m.%Y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) # url curArtUrl = parsers_common.get(articleDataDict["urls"], i) curArtUrl = curArtUrl.split('?ref=')[0] articleDataDict["urls"] = parsers_common.list_add_or_assign( articleDataDict["urls"], i, curArtUrl) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["authors"] = parsers_common.xpath_to( "list", pageTree, '//section/article/h2/a[2]/text()') articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//section/article/time/text()') articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//section/article/h2/a[@itemprop="url"]/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//section/article/h2/a[@itemprop="url"]/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "2021-01-06T10:11:04Z" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%Y-%m-%dt%H:%M:%S%z") articleDataDict["pubDates"][i] = curArtPubDate if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache="cacheAll") # description curArtDesc = parsers_common.xpath_to("single", pageTree, '//body/div//article', parent=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # image curArtImg = parsers_common.xpath_to( "single", pageTree, '//body/div//article/p/img/@src') articleDataDict["images"] = parsers_common.list_add_or_assign( articleDataDict["images"], i, curArtImg) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["images"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src') articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # descriptions curArtDesc = parsers_common.xpath_to("single", pageTree, '//div[@class="col-sm-9"]/p', multi=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # timeformat magic from "Avaldatud: 14 detsember, 2017" to datetime() curArtPubDate = parsers_common.xpath_to( "single", pageTree, '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()') curArtPubDate = parsers_datetime.months_to_int( curArtPubDate.split(':')[1]) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d %m, %Y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="js-newsline-container"]/span[1]/text()') articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="js-newsline-container"]/div/a/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="js-newsline-container"]/div/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "14 dets 2017 11:34" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.months_to_int(curArtPubDate) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d %m %Y %H:%M") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # description curArtDesc = parsers_common.xpath_to( "single", pageTree, '//div[@class="news-preview"]/div/text()') if not curArtDesc: curArtDesc = parsers_common.xpath_to( "single", pageTree, '//div[@class="content_item"]', parent=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["images"] = parsers_common.xpath_to( "list", pageTree, '//article/a/div/div/img/@src') articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//article/a/div[@class="node__body"]/p[@class="node__date"]/span/@content' ) articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//article/a/div[@class="node__body"]/h3/span/text()') articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//article/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "2021-03-23T12:35:36+00:00" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%Y-%m-%dt%H:%M:%S%z") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # description curArtDesc = parsers_common.xpath_to( "single", pageTree, '//div[@class="node__content"]/div/div[@class="field__item"]', parent=True) articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["authors"] = parsers_common.xpath_to( "list", pageTree, '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()' ) articleDataDict["descriptions"] = parsers_common.xpath_to( "list", pageTree, '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[1]/div/span[1]', parent=True) articleDataDict["pubDates"] = parsers_common.xpath_to( "list", pageTree, '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/time/@datetime' ) articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()' ) articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/@href' ) for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates 2021-02-12T16:08:02.000Z curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%Y-%m-%dT%H:%M:%S.000Z") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) # title curArtTitle = parsers_common.get(articleDataDict["titles"], i) curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain) articleDataDict["titles"] = parsers_common.list_add_or_assign( articleDataDict["titles"], i, curArtTitle) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/p[@class="b-posts__list-item-summary"]/text()') articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/text()') articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "30.01.2021" to datetime() curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i) curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%d.%m.%Y") articleDataDict["pubDates"] = parsers_common.list_add_or_assign(articleDataDict["pubDates"], i, curArtPubDate) if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # description curArtDesc = parsers_common.xpath_to("single", pageTree, '//div[@class="b-article"]', parent=True) articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href')

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//article/div[@class="bbWrapper"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates magic from "2021-01-28T16:15:42-0500" to datetime()
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%Y-%m-%dT%H:%M:%S%z")
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "topic " + str(i + 1) + " post no. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict

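
# The 'cacheStamped' mode used above pairs each topic URL with a cheap stamp
# (the reply count scraped from the listing), so a thread is only refetched
# when its stamp changes. A hedged sketch of that idea; the real
# parsers_common.get_article_tree() may implement caching differently.
_page_cache = {}  # url -> (stamp, tree)

def get_article_tree_stamped_sketch(fetch, url, pageStamp):
    """Return the cached tree for url unless pageStamp changed since last fetch."""
    cached = _page_cache.get(url)
    if cached and cached[0] == pageStamp:
        return cached[1]  # stamp unchanged: reuse the cached tree
    tree = fetch(url)     # first visit or stamp changed: fetch anew
    _page_cache[url] = (pageStamp, tree)
    return tree
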
def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["titles"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/text()') articleDataDict["urls"] = parsers_common.xpath_to( "list", pageTree, '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/@href') for i in parsers_common.article_urls_range(articleDataDict["urls"]): # titles curArtTitle = parsers_common.get(articleDataDict["titles"], i) curArtTitle = parsers_common.str_remove_clickbait(curArtTitle) articleDataDict["titles"] = parsers_common.list_add_or_assign( articleDataDict["titles"], i, curArtTitle) if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # author curArtAuthor = parsers_common.xpath_to( "single", pageTree, '//span[@class="author"]/text()') articleDataDict["authors"] = parsers_common.list_add_or_assign( articleDataDict["authors"], i, curArtAuthor) # description curArtDesc1 = parsers_common.xpath_to( "single", pageTree, '//div[@class="page-layout--block"][1]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]', parent=True) if not curArtDesc1: curArtDesc1 = parsers_common.xpath_to( "single", pageTree, '//div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]', parent=True) curArtDesc2 = parsers_common.xpath_to( "single", pageTree, '//div[@class="page-layout--block"][2]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]', parent=True, multi=True) curArtDesc = curArtDesc1 + curArtDesc2 curArtDesc = curArtDesc.split("Edasi lugemiseks")[0] curArtDesc = curArtDesc.split("Jaga:")[0] curArtDesc = curArtDesc.split("Samal teemal")[0] articleDataDict[ "descriptions"] = parsers_common.list_add_or_assign( articleDataDict["descriptions"], i, curArtDesc) # image curArtImg = parsers_common.xpath_to( "single", pageTree, '//div[@class="page-layout--block"][1]//div[@class="image-gallery-image first-in-gallery"][1]/picture[1]/img[@class="article-image"]/@src' ) if not curArtImg: curArtImg = parsers_common.xpath_to( "single", pageTree, '//div[@class="part"][1]/div/p/img/@src') articleDataDict["images"] = parsers_common.list_add_or_assign( articleDataDict["images"], i, curArtImg) # pubDates from "täna 16:53" to datetime() curArtPubDate = parsers_common.xpath_to( "single", pageTree, '//div[@class="details--inner"]/text()') curArtPubDate = curArtPubDate.split(",")[-1] curArtPubDate = parsers_datetime.replace_string_with_timeformat( curArtPubDate, "eile", "%d. %m %Y ", offsetDays=-1) curArtPubDate = parsers_datetime.replace_string_with_timeformat( curArtPubDate, "täna", "%d. %m %Y ", offsetDays=0) curArtPubDate = parsers_datetime.months_to_int(curArtPubDate) curArtPubDate = parsers_datetime.raw_to_datetime( curArtPubDate, "%d. %m %Y %H:%M") articleDataDict["pubDates"] = parsers_common.list_add_or_assign( articleDataDict["pubDates"], i, curArtPubDate) return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, parentPages["urls"][i] + '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true', cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates magic from "15.01.2012 23:49" to datetime()
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "topic post no. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict

def fill_article_dict(articleDataDict, pageTree, domain): articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/text()') articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/text()') articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/@href') articleDataDictPubDatesDay = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/span/text()') # remove unwanted content: titles dictList = [ "Sakala kuulutused", "Tartu Börs,", "positiivseid proove", ] articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles") for i in parsers_common.article_urls_range(articleDataDict["urls"]): # pubDates magic from "24.12.2017 17:51" to datetime() curArtPubDateDay = "" if len(articleDataDictPubDatesDay) - 1 >= i: curArtPubDateDay = parsers_common.get(articleDataDictPubDatesDay, i) curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Eile", "%d.%m.%Y", offsetDays=-1) curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Täna", "%d.%m.%Y", offsetDays=0) curArtPubDate = articleDataDict["pubDates"][i] curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDateDay + curArtPubDate, "%d.%m.%Y, %H:%M") articleDataDict["pubDates"][i] = curArtPubDate if parsers_common.should_get_article_body(i): curArtUrl = parsers_common.get(articleDataDict["urls"], i) # load article into tree pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll') # author curArtAuthor = parsers_common.xpath_to("single", pageTree, '//span[@class="article-authors__name"]/text()', multi=True) articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, curArtAuthor) # description1 - enne pilti curArtDesc1 = "" if not curArtDesc1: curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--video"][1]', parent=True, count=True) if not curArtDesc1: curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--articleBullets"]', parent=True, count=True) # description2 - pildi ja kuulamise vahel curArtDesc2 = "" if not curArtDesc2: curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="articleBody"]', parent=True, count=True) if not curArtDesc2: curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body__item--lead"]', parent=True, count=True, multi=True) if not curArtDesc2: curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//span[@class="figure__caption--title"][1]', parent=True, count=True) if not curArtDesc2: curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--first-child article-body__item--lead"]', parent=True, count=True) if not curArtDesc2: curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="description"]', parent=True, count=True) # description3 - pärast kuulamist curArtDesc3 = "" if not curArtDesc3: curArtDesc3 = parsers_common.xpath_to("single", pageTree, 
'//div[@class="article-body__item article-body__item--htmlElement"]', parent=True, count=True, multi=True) if not curArtDesc3: curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--premium-indicator"]', parent=True, count=True) # description4 - hall väljajuhatus curArtDesc4 = "" if not curArtDesc4: curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--teaser"]', parent=True, count=True) if not curArtDesc4: curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--gallery"]', parent=True, count=True, multi=True) # image curArtImg = "" if not curArtImg: curArtImg = parsers_common.xpath_to("single", pageTree, '//div[@class="article-superheader article-superheader--figure"]/div[@class="article-superheader__background"]/@style', count=True) curArtImg = curArtImg.split("url('")[-1].strip("');") if not curArtImg: curArtImg = parsers_common.xpath_to("single", pageTree, '//figure[@class="figure"]/img[@class="figure--has-fullscreen"]/@src', count=True) if not curArtImg: curArtImg = parsers_common.xpath_to("single", pageTree, '//meta[@property="og:image"]/@content', count=True) # kontrollid if "-kuulutused-" in curArtUrl: rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: kuulutused", 2) elif "-karikatuur" in curArtUrl: rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: karikatuur", 2) else: if not curArtDesc1: rss_print.print_debug(__file__, "1. plokk on tühi. (Pildieelne loendiplokk puudub?)", 2) else: rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 4) if not curArtDesc2: rss_print.print_debug(__file__, "2. plokk on tühi. (Pildi ja kuulamise vahe plokk puudub?) - " + curArtUrl, 0) else: rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 4) if not curArtDesc3: rss_print.print_debug(__file__, "3. plokk on tühi. (Pärast kuulamist plokk puudub?)", 0) else: rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 4) if not curArtDesc4: if "button--for-subscription" in curArtDesc3: curArtDesc3 = curArtDesc3.split('<span class="button--for-subscription')[0] rss_print.print_debug(__file__, "4. plokk on tühi. (Kolmandas plokis oli teemant)", 3) else: rss_print.print_debug(__file__, "4. plokk on tühi. (Hall fadeout plokk puudub?)", 2) else: rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 4) if not curArtImg: rss_print.print_debug(__file__, "pilti ei leitud.", 0) else: rss_print.print_debug(__file__, "curArtImg = " + curArtImg, 4) if curArtDesc1 and curArtDesc1 == curArtDesc2: rss_print.print_debug(__file__, "1. ja 2. plokk langevad kokku", 0) rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1) rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1) if curArtDesc2 and curArtDesc2 == curArtDesc3: rss_print.print_debug(__file__, "2. ja 3. plokk langevad kokku", 0) rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1) rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1) if curArtDesc3 and curArtDesc3 == curArtDesc4: rss_print.print_debug(__file__, "3. ja 4. plokk langevad kokku", 0) rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1) rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1) if curArtDesc4 and curArtDesc4 == curArtDesc1: rss_print.print_debug(__file__, "4. ja 1. 
plokk langevad kokku", 0) rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc3, 1) rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc4, 1) curArtDesc = curArtDesc1 + ' ' + curArtDesc2 + ' ' + curArtDesc3 + ' ' + curArtDesc4 if "button--for-subscription" in curArtDesc: curArtDesc = curArtDesc.replace(' tellijatele', '') curArtDesc = curArtDesc.replace('<a href="https://minumeedia.postimees.ee/kampaania/" target="_blank" class="my-media-link">digipaketi</a>', '') curArtDesc = curArtDesc.replace('<div class="article-body__item article-body__item--audio-teaser">', '<div>') curArtDesc = curArtDesc.replace('<div class="audio-teaser">', '<div>') curArtDesc = curArtDesc.replace('<img data-lazy-src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" src="/v5/img/icons/diamond-black-on-yellow.svg" width="30" height="30">', "") curArtDesc = curArtDesc.replace('<img src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" width="30" height="30">', "") curArtDesc = curArtDesc.replace('<span class="button--for-subscription">', "<span>") curArtDesc = curArtDesc.replace('<span class="button--for-subscription__diamond diamond--ee">', "<span>") curArtDesc = curArtDesc.replace('<span class="button--for-subscription__text"', "") curArtDesc = curArtDesc.replace('Artikkel on kuulatav', '') curArtDesc = curArtDesc.replace('Tellijale', '') articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc) articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImg) return articleDataDict