def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//ul[@class="news-list list"]' listing page.

    Collects descriptions, images, pubDates, titles and urls via xpath,
    then normalises every raw pubDate string into a datetime object.
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/p',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/p/img/@src')
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@title')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/h3/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "03.01.2018 11:09.08 [Tanel]" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        # drop the trailing "[author]" suffix before date parsing
        curArtPubDate = curArtPubDate.split('[')[0]
        curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d. %m %Y %H:%M:%S")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a band-page listing.

    A single page-level author and description are assigned to every
    listed item.  The 'domain' parameter is deliberately overwritten with
    the page's canonical og:url so titles get tagged with that address.
    """
    author = parsers_common.xpath_to("single", pageTree, '//p[@id="band-name-location"]/span[@class="title"]/text()')
    description = parsers_common.xpath_to("single", pageTree, '//meta[@name="description"]/@content')
    # NOTE(review): shadows the incoming 'domain' argument on purpose
    domain = parsers_common.xpath_to("single", pageTree, '//meta[@property="og:url"]/@content')

    articleDataDict["images"] = parsers_common.xpath_to("list", pageTree, '//div[@class="leftMiddleColumns"]/ol/li/a/div/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="leftMiddleColumns"]/ol/li/a/p/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="leftMiddleColumns"]/ol/li/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # author (same page-level author for every item)
        articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, author)

        # description (same page-level description for every item)
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, description)

        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(articleDataDict["titles"], i, curArtTitle)

        # url
        # NOTE(review): re-assigns the url unchanged - appears to be a no-op
        # kept for symmetry with the other fields; confirm before removing
        curArtUrl = parsers_common.get(articleDataDict["urls"], i)
        articleDataDict["urls"] = parsers_common.list_add_or_assign(articleDataDict["urls"], i, curArtUrl)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//div[@id="t-content"]' table listing.

    Filters rows on the "Tartu" keyword, then converts the short
    "29.08.19" pubDate strings into datetime objects.
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr', parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href')

    # remove unwanted content: descriptions
    # ("not in" mode: presumably drops entries whose description lacks
    # "Tartu" - confirm against article_data_dict_clean semantics)
    dictList = ["Tartu"]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "not in", "descriptions")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "29.08.19" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//div[@class="product"]' product listing.

    Each description is built by concatenating the product's price cell
    with its description cell (both taken as raw parent HTML).
    """
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product"]/div[@class="image-cell"]/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product"]/div[@class="description-cell"]/h2/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product"]/div[@class="description-cell"]/a/@href')

    priceCells = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product"]/div[@class="price-cell"]', parent=True)
    infoCells = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product"]/div[@class="description-cell"]', parent=True)

    for idx in parsers_common.article_urls_range(articleDataDict["urls"]):
        # description = price block followed by the description block
        mergedDesc = parsers_common.get(priceCells, idx) + parsers_common.get(infoCells, idx)
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], idx, mergedDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a forum message listing.

    Extracts authors, descriptions, pubDates, titles and urls from
    '//div[@class="message"]' nodes, drops blacklisted topics by title,
    trims descriptions, normalises titles and parses the pubDates.
    """
    articleDataDict["authors"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="message"]/div[@class="name"]', parent=True)
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="message"]/div[@class="content"]', parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="message"]/div[@class="posttime"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="message"]/div[@class="title"]/a[3]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="message"]/div[@class="title"]/a[3]/@href')

    # remove unwanted content: titles
    dictList = [
        "Hoiatus! Läbisõit 100% keritud",
        "Kaebused",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # description - drop everything from the userControls block onward
        curArtDesc = parsers_common.get(articleDataDict["descriptions"], i)
        curArtDesc = curArtDesc.split('<div class="userControls')[0]
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], i, curArtDesc)

        # title - strip the "Re: " reply prefix, then tag with the domain
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_lchop(curArtTitle, "Re: ")
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

        # pubDates magic from "20:22 01.09.2019" to Datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "eile", "%d.%m.%Y", offsetDays=-1)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "täna", "%d.%m.%Y", offsetDays=0)
        # NOTE(review): the date-suffix template literal was line-wrapped in
        # the mangled source; reconstructed as " %d.%m.%Y" - confirm against
        # upstream history
        curArtPubDate = parsers_datetime.add_missing_date_to_string(
            curArtPubDate, "%H:%M %d.%m.%Y", " %d.%m.%Y")
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%H:%M %d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a positional '//body/div[2]...' listing.

    Titles and urls come from the listing page; for selected articles the
    full page is fetched to extract a description (three layout
    fallbacks), the first lightbox image and an Estonian-language pubDate.
    """
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//body/div[2]/div[1]/main/div/div[3]/div/div/a/h2/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//body/div[2]/div[1]/main/div/div[3]/div/div/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # description - try three container layouts in turn
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree, '//div[@class="col-md-12"]', parent=True)
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single", pageTree, '//div[@class="col-md-8"]', parent=True)
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single", pageTree, '//div[@class="img-open-area basic-content pb-4"]', parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree, '//a[@data-lightbox="treimages"][1]/@href')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

            # pubDates magic from "7. märts 2021" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="col-sm-12 col-md-auto text-sm-center text-md-right"]/div[@class="mt-1 text-uppercase"]/small/text()')
            curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d. %m %Y")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from an '/html/body/div[3]' sb-article listing.

    For selected articles the full page is fetched to add the author, the
    full article text and a pubDate like "18.08.2019 21:35".
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-prolog"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-image"]/@style')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-title"]/h3/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="sg-article-details"]/div[@class="author"]/text()')
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree,
                '/html/body/div[3]/div/div[@class="page-content"]/div[@class="sg-article"]/div[@class="sg-article-text"]',
                parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

            # pubDates magic from "18.08.2019 21:35" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="sg-article-details"]/div[@class="date"]/text()')
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d.%m.%Y %H:%M")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//div[@class="box-news-block-title "]' feed.

    One fixed programme url is filtered out; selected articles are fetched
    to collect the author line and a two-part description, trimmed at the
    related-news teaser block.
    """
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="box-news-block-title "]/a[@title]/@title')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="box-news-block-title "]/a[@title]/@href')

    # remove unwanted content: urls
    dictList = [
        "https://sky.ee/rock-fmi-hommikuprogramm-igal-toopaeval-kell-7-10/"
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "urls")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author - keep only the part after the "label: " prefix
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="post-content"]/div[@class="article-page-author"]/p/text()')
            if ": " in curArtAuthor:
                curArtAuthor = curArtAuthor.split(": ")[1]
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description - lead paragraph plus the second content container
            curArtDesc1 = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="posts"]/div[@class="two-side-content-container clearfix"][1]//div[@class="post-content"]/strong/p/text()')
            curArtDesc2 = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="posts"]/div[@class="two-side-content-container clearfix"][2]//div[@class="post-content"]',
                parent=True)
            curArtDesc = curArtDesc1 + curArtDesc2
            # cut off the related-news teaser if present
            if '<p class="related-cta">LOE KA NEID UUDISEID!</p>' in curArtDesc:
                curArtDesc = curArtDesc.split(
                    '<p class="related-cta">LOE KA NEID UUDISEID!</p>')[0]
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a job-listing page.

    Drops postings whose title matches an unwanted-profession list, parses
    pubDates like "Avaldatud: 12.12.2017" and strips "?ref=" tracking
    suffixes from urls.
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/p',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="right-content"]/div[@class="application-date"][1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/@href')

    # remove unwanted content: titles
    # (comment fixed - the filter below targets "titles", not "descriptions")
    dictList = [
        "jurist",
        "logopeed",
        "pedagoog",
        "psühholoog",
        "raamatupidaja",
        "sotsiaal",
        "õpetaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "Avaldatud: 12.12.2017" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        # [-1] instead of [1]: identical result for "Avaldatud: 12.12.2017",
        # but no IndexError if the "Avaldatud: " prefix is ever missing
        curArtPubDate = curArtPubDate.split(': ')[-1]
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # url - strip the "?ref=" tracking suffix
        curArtUrl = parsers_common.get(articleDataDict["urls"], i)
        curArtUrl = curArtUrl.split('?ref=')[0]
        articleDataDict["urls"] = parsers_common.list_add_or_assign(
            articleDataDict["urls"], i, curArtUrl)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//section/article' listing page.

    Parses ISO-8601 pubDates ("2021-01-06T10:11:04Z") and, for selected
    articles, fetches the page to store its <article> body and first
    inline image.
    """
    articleDataDict["authors"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[2]/text()')
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/time/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[@itemprop="url"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[@itemprop="url"]/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "2021-01-06T10:11:04Z" to datetime()
        # (lowercase 't' in the format - raw_to_datetime presumably
        # lower-cases its input, matching the other parsers; confirm)
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dt%H:%M:%S%z")
        # fixed: use list_add_or_assign like every sibling parser instead of
        # direct "[i] =" indexing, which raises IndexError when the pubDates
        # list is shorter than the urls list
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache="cacheAll")

            # description
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree, '//body/div//article', parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree, '//body/div//article/p/img/@src')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//div[@class="post-item"]' listing.

    For selected articles the full page is fetched to collect the body
    paragraphs and a pubDate like "Avaldatud: 14 detsember, 2017".
    """
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # descriptions (multi=True: presumably merges all <p> matches
            # into one value - confirm against xpath_to semantics)
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree, '//div[@class="col-sm-9"]/p', multi=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

            # timeformat magic from "Avaldatud: 14 detsember, 2017" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()')
            # drop the "Avaldatud:" label, map the month name to a number
            curArtPubDate = parsers_datetime.months_to_int(
                curArtPubDate.split(':')[1])
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d %m, %Y")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from an '//article' node listing.

    Parses ISO pubDates ("2021-03-23T12:35:36+00:00") and, for selected
    articles, fetches the page for the description field.
    """
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//article/a/div/div/img/@src')
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//article/a/div[@class="node__body"]/p[@class="node__date"]/span/@content')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//article/a/div[@class="node__body"]/h3/span/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//article/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "2021-03-23T12:35:36+00:00" to datetime()
        # (lowercase 't' in the format - raw_to_datetime presumably
        # lower-cases its input, matching the other parsers; confirm)
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dt%H:%M:%S%z")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # description
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="node__content"]/div/div[@class="field__item"]',
                parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//div[@class="js-newsline-container"]' feed.

    Converts pubDates like "14 dets 2017 11:34" to datetime; for selected
    articles the page is fetched and the description extracted with a
    preview-first, full-content fallback.
    """
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="js-newsline-container"]/span[1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="js-newsline-container"]/div/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="js-newsline-container"]/div/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "14 dets 2017 11:34" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d %m %Y %H:%M")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # description - prefer the preview text, fall back to full content
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree, '//div[@class="news-preview"]/div/text()')
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single", pageTree, '//div[@class="content_item"]', parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a product-campaign listing page.

    Builds each description from the price box plus the product info box,
    strips a blank spacer image from the merged HTML, and normalises the
    image URLs (drops the query string, unescapes '%26').
    """
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="product_camp_box w"]/a/div[@class="product_camp"]/div[@class="leftC"]/div/img/@data-src')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="product_camp_box w"]/a/div[@class="product_camp"]/div[@class="leftC"]/h3/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="product_camp_box w"]/a/@href')

    articlePrices = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="product_camp_box w"]/div[@class="priceCont"]',
        parent=True)
    articleDescriptions = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="product_camp_box w"]/a/div[@class="product_camp"]/div[@class="leftC"]',
        parent=True)

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # description - fixed: use parsers_common.get (as the sibling product
        # parser does) instead of direct indexing, so a price/description
        # list shorter than the url list cannot raise IndexError
        curArtDesc = parsers_common.get(articlePrices, i) + parsers_common.get(articleDescriptions, i)
        curArtDesc = curArtDesc.replace(
            '<img src="https://www.stokker.ee/gfx/blank.gif">', "")
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], i, curArtDesc)

        # image - drop the query string and unescape encoded ampersands
        curArtImg = parsers_common.get(articleDataDict["images"], i)
        curArtImg = curArtImg.split("?")[0]
        curArtImg = curArtImg.replace("%26", "&")
        articleDataDict["images"] = parsers_common.list_add_or_assign(
            articleDataDict["images"], i, curArtImg)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a deeply positional social-feed layout.

    NOTE(review): "authors" and "titles" are read from the identical
    xpath, so both lists carry the same text - looks intentional for this
    feed (posts have no separate headline), but verify.
    """
    articleDataDict["authors"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()')
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[1]/div/span[1]',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/time/@datetime')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates 2021-02-12T16:08:02.000Z
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dT%H:%M:%S.000Z")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # title - tag with the source domain
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a '//li[@class="b-posts__list-item"]' listing.

    The listing supplies pubDates, titles and urls; "30.01.2021"-style
    dates are converted to datetime objects, and for selected items the
    full article page is fetched so its body serves as the description.
    """
    articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/p[@class="b-posts__list-item-summary"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/@href')

    for idx in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "30.01.2021" to datetime()
        rawDate = parsers_common.get(articleDataDict["pubDates"], idx)
        parsedDate = parsers_datetime.raw_to_datetime(rawDate, "%d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], idx, parsedDate)

        if not parsers_common.should_get_article_body(idx):
            continue

        articleUrl = parsers_common.get(articleDataDict["urls"], idx)

        # load article into tree
        pageTree = parsers_common.get_article_tree(domain, articleUrl, cache='cacheAll')

        # description
        bodyHtml = parsers_common.xpath_to(
            "single", pageTree, '//div[@class="b-article"]', parent=True)
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], idx, bodyHtml)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a XenForo-style forum thread listing.

    Walks the thread list, fetches each selected thread's last page
    ("page-1000" jumps to the final page) using the reply count as a cache
    stamp, and appends every post as a separate article entry.
    """
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href')

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the threads
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the thread page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            # "page-1000" jumps to the last existing page of the thread
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list", pageTree, '//article/div[@class="bbWrapper"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime')
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href')

            # iterate over the thread's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate, "%Y-%m-%dT%H:%M:%S%z")  # 2021-01-28T16:15:42-0500
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title - every post inherits the parent thread's title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__,
                    "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j],
                    2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from an LHV-style forum topic listing.

    Walks the topic table, fetches each selected topic (jumping to a
    max-size page of posts, cached by the topic's stamp column) and
    appends every post as a separate article entry.
    """
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i] + '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true',
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()')
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j,
                    parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates magic from "15.01.2012 23:49" to datetime()
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                # "Eile" (= yesterday) is replaced with an explicit date first
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                    curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(
                    curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title - every post inherits the parent topic's title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url - the post permalink is relative to the topic url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__,
                    "teema postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j],
                    2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Scrape an article listing page and enrich entries with body data.

    Titles/urls come from the listing; for a limited number of entries the
    article page itself is fetched to fill author, description, image and
    pubDate. Returns the filled *articleDataDict*.
    """
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/@href')

    for idx in parsers_common.article_urls_range(articleDataDict["urls"]):
        # titles: strip clickbait phrasing
        headline = parsers_common.get(articleDataDict["titles"], idx)
        headline = parsers_common.str_remove_clickbait(headline)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(articleDataDict["titles"], idx, headline)

        if not parsers_common.should_get_article_body(idx):
            continue

        articleUrl = parsers_common.get(articleDataDict["urls"], idx)

        # load article into tree
        pageTree = parsers_common.get_article_tree(domain, articleUrl, cache='cacheAll')

        # author
        authorName = parsers_common.xpath_to("single", pageTree, '//span[@class="author"]/text()')
        articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], idx, authorName)

        # description: lead block (with a fallback selector) + second layout block
        leadHtml = parsers_common.xpath_to("single", pageTree, '//div[@class="page-layout--block"][1]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]', parent=True)
        if not leadHtml:
            leadHtml = parsers_common.xpath_to("single", pageTree, '//div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]', parent=True)
        bodyHtml = parsers_common.xpath_to("single", pageTree, '//div[@class="page-layout--block"][2]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]', parent=True, multi=True)
        descriptionHtml = leadHtml + bodyHtml
        # cut everything from the first paywall/share/related-articles marker on
        for marker in ("Edasi lugemiseks", "Jaga:", "Samal teemal"):
            descriptionHtml = descriptionHtml.split(marker)[0]
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], idx, descriptionHtml)

        # image: gallery image first, plain body image as fallback
        imageUrl = parsers_common.xpath_to("single", pageTree, '//div[@class="page-layout--block"][1]//div[@class="image-gallery-image first-in-gallery"][1]/picture[1]/img[@class="article-image"]/@src')
        if not imageUrl:
            imageUrl = parsers_common.xpath_to("single", pageTree, '//div[@class="part"][1]/div/p/img/@src')
        articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], idx, imageUrl)

        # pubDates from "täna 16:53" to datetime()
        publishedRaw = parsers_common.xpath_to("single", pageTree, '//div[@class="details--inner"]/text()')
        publishedRaw = publishedRaw.split(",")[-1]
        publishedRaw = parsers_datetime.replace_string_with_timeformat(publishedRaw, "eile", "%d. %m %Y ", offsetDays=-1)
        publishedRaw = parsers_datetime.replace_string_with_timeformat(publishedRaw, "täna", "%d. %m %Y ", offsetDays=0)
        publishedRaw = parsers_datetime.months_to_int(publishedRaw)
        publishedDt = parsers_datetime.raw_to_datetime(publishedRaw, "%d. %m %Y %H:%M")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(articleDataDict["pubDates"], idx, publishedDt)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Scrape an event-listing page ("list-item" blocks) into articleDataDict.

    Filters out cancelled/postponed events by title, normalizes image and
    url fields, optionally fetches event pages for a fuller description,
    and finally reverses the order. Returns the filled *articleDataDict*.
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="list-item"]/div[@class="details"]', parent=True)
    articleDataDict["images"] = parsers_common.xpath_to("list", pageTree, '//div[@class="list-item"]/a[@class="image"]/@style')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="list-item"]/div[@class="details"]/a[1]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="list-item"]/div[@class="details"]/a[1]/@href')

    # remove unwanted content: titles
    dictList = [
        "edasi lükatud",
        "ei toimu",
        "jääb ära",
        "lükkub edasi",
        "tühistatud",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    for idx in parsers_common.article_urls_range(articleDataDict["urls"]):
        # image: the url sits inside a CSS style attribute, between quotes
        imageUrl = parsers_common.get(articleDataDict["images"], idx)
        imageUrl = parsers_common.split_failsafe(imageUrl, "'", 1)
        articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], idx, imageUrl)

        # url: drop the "?event..." query suffix
        eventUrl = parsers_common.get(articleDataDict["urls"], idx)
        eventUrl = eventUrl.split("?event")[0]
        articleDataDict["urls"] = parsers_common.list_add_or_assign(articleDataDict["urls"], idx, eventUrl)

        if not parsers_common.should_get_article_body(idx):
            continue

        # load article into tree
        pageTree = parsers_common.get_article_tree(domain, eventUrl, cache='cacheAll')

        # description: listing snippet + cleaned article body
        listingDesc = parsers_common.get(articleDataDict["descriptions"], idx)
        bodyDesc = parsers_common.xpath_to("single", pageTree, '//article', parent=True)
        # strip boilerplate headings/links from the article body
        for boilerplate in (
            '<h5 class="sm-hide">Galerii</h5>',
            '<h5 class="sm-hide">Tutvustus</h5>',
            '<div class="link after-arrow_down sm-show">Loe lähemalt</div>',
        ):
            bodyDesc = bodyDesc.replace(boilerplate, "")
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], idx, listingDesc + bodyDesc)

    # reverse to the desired order
    articleDataDict = parsers_common.dict_reverse_order(articleDataDict)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Collect forum posts from an "adds-list" topic listing (phpBB-style pages).

    Scans the active-topics tab in *pageTree*, reverses it, filters out
    unwanted (mostly gaming/hardware) topics by title, then loads a limited
    number of topic pages and appends each post's fields to
    *articleDataDict*. Returns the filled *articleDataDict*.
    """
    # Cap how many topic bodies get fetched; the per-run post budget is
    # split evenly across those bodies.
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    # Topic listing: per-topic cache stamps, titles and urls.
    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/span[@class="adds-list-meta"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/@href')

    # reverse to the desired order
    parentPages = parsers_common.dict_reverse_order(parentPages)

    # remove unwanted content: titles
    dictList = [
        "AMD",
        "Apple",
        "Assassin",
        "Batman",
        "Battlefield",
        "Call of Duty",
        "Cyberpunk",
        "Diablo 2",
        "Dying",
        "Escape From Tarkov",
        "Euro Truck",
        "Evil",
        "FIFA",
        "Far Cry",
        "Forza",
        "Galaxy",
        "Grand Theft",
        "IPhon",
        "Kindle",
        "MMORPG",
        "MSI",
        "MacBook",
        "MacOS",
        "Mafia",
        "Mass Effect",
        "Meizu",
        "Minecraft",
        "Nintendo",
        "Pixel",
        "PlayStation",
        "Steam",
        "Tanks",
        "Vidia",
        "War Thunder",
        "Watercool",
        "Windows",
        "Xbox",
        "arvutikast",
        "exile",
        "foorumiga seotud",
        "konsool",
        "korpust",
        "moderaatorite",
        "seotud vead",
        "siia lingid",
        "toiteplok",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # normalize in place: keep only the date part of the stamp and
            # drop the session id from the url
            parentPages["stamps"][i] = parentPages["stamps"][i].split("/")[0]
            parentPages["urls"][i] = parentPages["urls"][i].split("&sid=")[0]

            # load topic page into tree
            pageTree = parsers_common.get_article_tree(domain, parentPages["urls"][i], cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            # Per-post fields scraped from the topic page.
            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[1]/span[@class="name"]/b/a/text()')
            articlePostsDict["descriptions1"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][1]', parent=True)
            articlePostsDict["descriptions2"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][2]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/span[@class="postdetails"][1]/text()[1]')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/a/@href')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description: primary body span, second span as fallback;
                # quote markup is normalized to <blockquote>
                curArtDesc = parsers_common.get(articlePostsDict["descriptions1"], j)
                if not curArtDesc:
                    curArtDesc = parsers_common.get(articlePostsDict["descriptions2"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates: first 16 chars hold the date+time
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = curArtPubDate[0:16]
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)  # 14.07.2020 07:59
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title: every post inherits the topic title, tagged with domain
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Scrape an ERR-style "history" listing page into articleDataDict.

    Parses times/titles/urls from the listing, filters out broadcast-type
    entries by title, optionally fetches article pages for author,
    description and image, drops articles with empty bodies, and reverses
    the final order. Returns the filled *articleDataDict*.
    """
    articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="history-item"]/span[@class="history-date ng-binding"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="history-item"]/p[@class="history-header"]/a[@class="ng-binding"]/text()[1]')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="history-item"]/p[@class="history-header"]/a[@class="ng-binding"]/@href')

    # lastArtPubDate feeds the previous article's datetime into the
    # guess-missing parser; iMinus compensates indices after deletions.
    lastArtPubDate = ""
    iMinus = 0

    # remove unwanted content: titles
    dictList = [
        " reket",
        " võidu",
        "Aktuaalne kaamera",
        "ERR-i teleuudised",
        "ETV spordi",
        "Ilm ",
        "OTSE ",
        "Päevakaja",
        "Raadiouudised",
        "Viipekeelsed uudised",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # shift the index left by the number of rows deleted so far
        i = i - iMinus

        # pubDates magic from "11:34" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime_guess_missing(curArtPubDate, lastArtPubDate, "%Y %m %d ", "%H:%M", -1)
        lastArtPubDate = curArtPubDate
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(articleDataDict["pubDates"], i, curArtPubDate)

        # title: keep only the part after the last "|" separator
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        if "|" in curArtTitle:
            curArtTitle = curArtTitle.split("|")[-1]
        articleDataDict["titles"] = parsers_common.list_add_or_assign(articleDataDict["titles"], i, curArtTitle)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to("single", pageTree, '//article/div[@class="body"]/div/div[@class="meta"]/section/div[@class="byline"]/span/text()')
            articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, curArtAuthor)

            # description: lead + body text; an empty body means the row
            # is removed entirely and the loop index compensated
            curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//article/div[@class="body"]/div[@class="lead"]', parent=True)
            curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//article/div[@class="body"]/div[@class="text flex-row"]', parent=True)
            if not curArtDesc2:
                rss_print.print_debug(__file__, "tühja sisuga uudis, eemaldame rea " + str(i), 1)
                articleDataDict = parsers_common.dict_del_article_index(articleDataDict, i)
                iMinus += 1
                continue
            curArtDesc = curArtDesc1 + "<br>" + curArtDesc2
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)

            # image: og:image meta tag, plain image meta as fallback
            curArtImg = parsers_common.xpath_to("single", pageTree, '/html/head/meta[@property="og:image"]/@content')
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '/html/head/meta[@property="image"]/@content')
            articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImg)

    # reverse to the desired order
    articleDataDict = parsers_common.dict_reverse_order(articleDataDict)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Collect forum posts from a MyBB-style topic listing.

    Scans the listing in *pageTree*, then loads a limited number of topic
    pages (all posts on one page via "&page=-1") and appends each post's
    fields to *articleDataDict*. Returns the filled *articleDataDict*.
    """
    # Cap how many topic bodies get fetched; the per-run post budget is
    # split evenly across those bodies.
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    # Topic listing: reply-count stamps, titles and urls.
    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//div[@data-lang="Vastuseid"]/a/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//span[@class=" subject_old"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//span[@class=" subject_old"]/a/@href')

    # remove unwanted content: titles (currently no filters)
    #dictList = []
    #parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # drop the session id and request all posts on one page
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
            curParentUrl = curParentUrl + "&page=-1"
            curParentUrl = parsers_common.str_domain_url(domain, curParentUrl)
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            # Per-post fields scraped from the topic page.
            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//div[@class="author_information"]/strong/span[@class="largetext"]/a/text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_body scaleimages"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_head"]/span[@class="post_date"]', parent=True)
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_content"]/div[@class="post_head"]/div/strong/a/@href')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description: normalize quote markup to <blockquote>
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates: the date is embedded in a span's title attribute;
                # "Eile"/"Täna" (yesterday/today) need special handling
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_common.str_lchop(curArtPubDate, '<span title="')
                curArtPubDate = curArtPubDate.split(" <span class")[0]
                if "Eile" in curArtPubDate or "Täna" in curArtPubDate:
                    curArtPubDate = curArtPubDate.split('">')[1]
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Eile</span>", "%d-%m-%Y", offsetDays=-1)
                    curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "Täna</span>", "%d-%m-%Y", offsetDays=0)
                else:
                    curArtPubDate = curArtPubDate.split('">')[0]
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title: every post inherits the topic title, tagged with domain
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Collect forum posts from phpBB-style boards serving several domains.

    Uses domain-specific xpaths for kipper.ee/militaar.net (old phpBB
    markup, "version" 1), a generic set otherwise, plus a hard-coded
    fallback topic for arutelud.com when no active topics are found.
    Returns the filled *articleDataDict*.

    NOTE(review): the original file's indentation was lost; the nesting of
    the pubDate length-check fallbacks below is the most plausible
    reconstruction — confirm against upstream history.
    """
    # Cap how many topic bodies get fetched; the per-run post budget is
    # split evenly across those bodies.
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 8)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    # "version" selects which scrape xpaths to use further down
    version = 0
    if "kipper.ee" in domain or "militaar.net" in domain:
        version = 1
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/p[@class="topicdetails"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/text()')
        parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/@href')
    else:
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/dd[@class="posts"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/text()')
        parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/@href')

    # arutelud.com fallback: no active topics found, visit the big
    # catch-all discussion topic instead
    if not parentPages["urls"] and "arutelud.com" in domain:
        rss_print.print_debug(__file__, "aktiivseid teemasid ei leitud, arutelude foorumis külastame mammutteemat", 1)
        parentPages["stamps"] = parsers_common.list_add_or_assign(parentPages["stamps"], 0, "")
        parentPages["titles"] = parsers_common.list_add_or_assign(parentPages["titles"], 0, "Arutelud")
        parentPages["urls"] = parsers_common.list_add_or_assign(parentPages["urls"], 0, "https://arutelud.com/viewtopic.php?f=3&t=4&sd=d&sk=t&st=7")

    # remove unwanted content: titles
    dictList = [
        "Race.Fi:",
        "Write my",
        "PÕLVAMAA, VÕRUMAA JA VALGAMAA CB JA HAM SIDE",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # drop the session id and jump far enough to reach the last page
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
            curParentUrl = curParentUrl + "&start=100000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            if version in (1, 2):
                rss_print.print_debug(__file__, "kasutame spetsiifilist hankimist, domain = " + domain, 2)
                articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//tr/td/b[@class="postauthor"]/text()')
                articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//tr/td/div[@class="postbody"][1]', parent=True)
                articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/text()')
                articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/a/@href')
            else:
                rss_print.print_debug(__file__, "kasutame üldist hankimist, domain = " + domain, 3)
                articlePostsDict["authors"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]//strong//text()')
                articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="content"]', parent=True)
                articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/@datetime')
                articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/a/@href')

                # pubDate fallbacks: <time> text, then raw author-line text
                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/text()')
                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[1]')
                    if len(articlePostsDict["pubDates"][0]) < 5:
                        rss_print.print_debug(__file__, "hangitud aeg[0] liiga lühike: '" + articlePostsDict["pubDates"][0] + "', proovime alternatiivi...", 0)
                        articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[2]')
                        if len(articlePostsDict["pubDates"][0]) < 5:
                            rss_print.print_debug(__file__, "hangitud aeg[0] liiga lühike: '" + articlePostsDict["pubDates"][0] + "'", 0)
                        else:
                            rss_print.print_debug(__file__, "hangitud aeg[0]: '" + articlePostsDict["pubDates"][0] + "'", 4)
                if not articlePostsDict["pubDates"]:
                    rss_print.print_debug(__file__, "ei suutnud hankida ühtegi aega", 0)

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description: normalize quote markup to <blockquote>
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates: normalize month names, weekdays and
                # "eile"/"täna" (yesterday/today) before guessing the format
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
                curArtPubDate = parsers_datetime.remove_weekday_strings(curArtPubDate)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "eile", "%d %m %Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "täna", "%d %m %Y", offsetDays=0)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title: every post inherits the topic title, tagged with domain
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Scrape a radio-show/program listing page into articleDataDict.

    Reads descriptions, images, titles and urls from the listing, filters
    out recurring shows by title and tag-only posts by description, and
    tags every remaining title with the domain. Returns the filled
    *articleDataDict*.
    """
    articleDataDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//main/div[@class="content"]/div/div/div[1]/div/div', parent=True)
    articleDataDict["images"] = parsers_common.xpath_to("list", pageTree, '//main/div[1]/div/div/div[1]/div/div/a/div/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//main/div[@class="content"]/div/div/div[1]/div/div/div/div[1]/div[2]/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//main/div[@class="content"]/div/div/div[1]/div/div/div/div[1]/div[2]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "(uus) raamat",
        "abramova",
        "akvavit",
        "based broccoli",
        "beats of no nation",
        "bisweed",
        "ekkm",
        "error!",
        "floorshow",
        "gnoom",
        "hard feeler",
        "hillbilly picnic",
        "ida jutud",
        "ida räpp",
        "intro",
        "katus",
        "keskkonnatund",
        "kink konk",
        "korrosioon",
        "kräpp",
        "let me juke",
        "liin ",
        "liin",
        "lunchbreak lunchdate",
        "meie igapäevane avalik ruum",
        "milk",
        "muster",
        "myös",
        "müürilehe hommik",
        # BUGFIX: a missing comma made these two entries concatenate into
        # the single useless filter "n-liboleneb päevast!", so neither
        # "n-lib" nor "oleneb päevast!" titles were ever filtered out.
        "n-lib",
        "oleneb päevast!",
        "oujee!",
        "paneel",
        "playa music",
        "propel",
        "puhkus",
        "rets records",
        "room_202",
        "rõhk",
        "saal raadio",
        "soojad suhted",
        "svet nureka",
        "söökladisko",
        "triinemets.",
        "vitamiin k",
        "zubrovka am",
        "ära kaaguta!",
        "öömaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    # remove unwanted content: descriptions
    dictList = [
        "#hip hop",
        "#interview",
        "#rap",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "descriptions")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # title: tag with the domain
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(articleDataDict["titles"], i, curArtTitle)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Collect forum posts from a bbPress-style topic listing.

    Scans the topic table, filters out unwanted topics by title, loads a
    limited number of topic pages and appends each reply's description,
    pubDate, title and url to *articleDataDict*; finally filters out
    removed/fake replies by description. Returns the filled
    *articleDataDict*.
    """
    # Cap how many topic bodies get fetched; the per-run post budget is
    # split evenly across those bodies.
    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 1)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    # Topic listing: per-topic cache stamps, titles and urls.
    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-1"]/text()')
    parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-7 teemapealkiri"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to("list", pageTree, '//tbody/tr/th[@class="col-4"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Lõvide perekonna uus teema",
        "abort",
        "beebi",
        "ivf",
        "lapse",
        "rase ",
        "rased",
        "triibupüüdjad",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # iterate over topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # drop the "/#anchor" fragment before fetching
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("/#")[0]

            # load topic page into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPages["stamps"][i])

            # Per-reply fields scraped from the topic page.
            articlePostsDict = {}
            articlePostsDict["descriptions"] = parsers_common.xpath_to("list", pageTree, '//div[@class="bbp-reply-content entry-content"]', parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="post_date date updated"]/text()')
            articlePostsDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="bbp-reply-header entry-title"]/@id')

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # description: cut the ratings widget, drop empty wrappers
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.split('<div class="gdrts-rating-block ')[0]
                curArtDesc = parsers_html.html_remove_single_parents(curArtDesc)
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title: every post inherits the topic title, tagged with domain
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url: topic url + reply anchor id
                curArtUrl = parsers_common.get(parentPages["urls"], i) + "/#" + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    # remove unwanted content: descriptions
    dictList = [
        # BUGFIX: a missing comma made these two entries concatenate into
        # the single filter " liba Kommentaar eemaldatud.", so neither
        # " liba " nor "Kommentaar eemaldatud." was ever matched.
        " liba ",
        "Kommentaar eemaldatud.",
        "Liba?",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "descriptions")

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):
    """Fill articleDataDict from a news listing page; for selected items also
    fetch the article page to extract author, description blocks and an image.

    Interface unchanged: takes the shared articleDataDict, the parsed list-page
    tree and the site domain; returns the filled articleDataDict.
    """
    articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/@href')
    articleDataDictPubDatesDay = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/span/text()')

    # Remove unwanted content by title substring.
    dictList = [
        "Sakala kuulutused",
        "Tartu Börs,",
        "positiivseid proove",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "24.12.2017 17:51" to datetime()
        curArtPubDateDay = ""
        if len(articleDataDictPubDatesDay) - 1 >= i:
            # Day part may be the literal "Eile"/"Täna" (yesterday/today) —
            # convert to a concrete date string first.
            curArtPubDateDay = parsers_common.get(articleDataDictPubDatesDay, i)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Eile", "%d.%m.%Y", offsetDays=-1)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Täna", "%d.%m.%Y", offsetDays=0)
        curArtPubDate = articleDataDict["pubDates"][i]
        curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDateDay + curArtPubDate, "%d.%m.%Y, %H:%M")
        articleDataDict["pubDates"][i] = curArtPubDate

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            # NOTE: rebinds the pageTree parameter to the article page tree.
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to("single", pageTree, '//span[@class="article-authors__name"]/text()', multi=True)
            articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, curArtAuthor)

            # description block 1 - before the image (first non-empty xpath wins)
            curArtDesc1 = ""
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--video"][1]', parent=True, count=True)
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--articleBullets"]', parent=True, count=True)

            # description block 2 - between the image and the audio player
            curArtDesc2 = ""
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="articleBody"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body__item--lead"]', parent=True, count=True, multi=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//span[@class="figure__caption--title"][1]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--first-child article-body__item--lead"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="description"]', parent=True, count=True)

            # description block 3 - after the audio player
            curArtDesc3 = ""
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement"]', parent=True, count=True, multi=True)
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--premium-indicator"]', parent=True, count=True)

            # description block 4 - gray fade-out teaser
            curArtDesc4 = ""
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--teaser"]', parent=True, count=True)
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--gallery"]', parent=True, count=True, multi=True)

            # image (first non-empty source wins)
            curArtImg = ""
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//div[@class="article-superheader article-superheader--figure"]/div[@class="article-superheader__background"]/@style', count=True)
                # style attribute looks like "...url('<src>');" — cut out <src>
                curArtImg = curArtImg.split("url('")[-1].strip("');")
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//figure[@class="figure"]/img[@class="figure--has-fullscreen"]/@src', count=True)
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//meta[@property="og:image"]/@content', count=True)

            # sanity checks (skipped for classifieds and caricature pages)
            if "-kuulutused-" in curArtUrl:
                rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: kuulutused", 2)
            elif "-karikatuur" in curArtUrl:
                rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: karikatuur", 2)
            else:
                if not curArtDesc1:
                    rss_print.print_debug(__file__, "1. plokk on tühi. (Pildieelne loendiplokk puudub?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 4)

                if not curArtDesc2:
                    rss_print.print_debug(__file__, "2. plokk on tühi. (Pildi ja kuulamise vahe plokk puudub?) - " + curArtUrl, 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 4)

                if not curArtDesc3:
                    rss_print.print_debug(__file__, "3. plokk on tühi. (Pärast kuulamist plokk puudub?)", 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 4)

                if not curArtDesc4:
                    if "button--for-subscription" in curArtDesc3:
                        # subscription diamond ended up in block 3 — trim it off
                        curArtDesc3 = curArtDesc3.split('<span class="button--for-subscription')[0]
                        rss_print.print_debug(__file__, "4. plokk on tühi. (Kolmandas plokis oli teemant)", 3)
                    else:
                        rss_print.print_debug(__file__, "4. plokk on tühi. (Hall fadeout plokk puudub?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 4)

                if not curArtImg:
                    rss_print.print_debug(__file__, "pilti ei leitud.", 0)
                else:
                    rss_print.print_debug(__file__, "curArtImg = " + curArtImg, 4)

                # detect duplicate blocks
                if curArtDesc1 and curArtDesc1 == curArtDesc2:
                    rss_print.print_debug(__file__, "1. ja 2. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                if curArtDesc2 and curArtDesc2 == curArtDesc3:
                    rss_print.print_debug(__file__, "2. ja 3. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                if curArtDesc3 and curArtDesc3 == curArtDesc4:
                    rss_print.print_debug(__file__, "3. ja 4. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1)
                if curArtDesc4 and curArtDesc4 == curArtDesc1:
                    rss_print.print_debug(__file__, "4. ja 1. plokk langevad kokku", 0)
                    # BUGFIX: original printed curArtDesc3/curArtDesc4 under the
                    # labels curArtDesc4/curArtDesc1 — a copy-paste error that
                    # made this diagnostic output misleading.
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1)

            curArtDesc = curArtDesc1 + ' ' + curArtDesc2 + ' ' + curArtDesc3 + ' ' + curArtDesc4

            # strip paywall/subscription markup from the combined description
            if "button--for-subscription" in curArtDesc:
                curArtDesc = curArtDesc.replace(' tellijatele', '')
                curArtDesc = curArtDesc.replace('<a href="https://minumeedia.postimees.ee/kampaania/" target="_blank" class="my-media-link">digipaketi</a>', '')
                curArtDesc = curArtDesc.replace('<div class="article-body__item article-body__item--audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<div class="audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<img data-lazy-src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" src="/v5/img/icons/diamond-black-on-yellow.svg" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<img src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription">', "<span>")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__diamond diamond--ee">', "<span>")
                # NOTE(review): pattern below has no closing '>' — it removes
                # only the tag prefix; looks deliberate (tag carries more
                # attributes) but worth confirming against live markup.
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__text"', "")
                curArtDesc = curArtDesc.replace('Artikkel on kuulatav', '')
                curArtDesc = curArtDesc.replace('Tellijale', '')

            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)
            articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImg)

    return articleDataDict