# shared project-local helper modules used by every example below
# (exact import paths assumed)
import parsers_common
import parsers_datetime
import rss_config
import rss_print


def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr', parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href')

    # remove unwanted content: keep only articles whose descriptions mention "Tartu"
    dictList = ["Tartu"]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "not in", "descriptions")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "29.08.19" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
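
# For reference: parsers_datetime.raw_to_datetime is not shown on this page.
# A minimal sketch, assuming it simply wraps datetime.strptime; the real
# helper may add error handling or timezone logic.
from datetime import datetime

def raw_to_datetime(rawString, timeFormat):
    # e.g. raw_to_datetime("29.08.19", "%d.%m.%y") -> datetime(2019, 8, 29)
    return datetime.strptime(rawString.strip(), timeFormat)
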
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/p',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/p/img/@src'
    )
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@title'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/div/h3/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul[@class="news-list list"]/li/div[@class="inner"]/a[@class="news-list-link"]/@href'
    )

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "03.01.2018 11:09.08 [Tanel]" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = curArtPubDate.split('[')[0]
        curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d. %m %Y %H:%M:%S")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["authors"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="message"]/div[@class="name"]',
        parent=True)
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="message"]/div[@class="content"]',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="posttime"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="title"]/a[3]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="title"]/a[3]/@href')

    # remove unwanted content: titles
    dictList = [
        "Hoiatus! Läbisõit 100% keritud",
        "Kaebused",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # description
        curArtDesc = parsers_common.get(articleDataDict["descriptions"], i)
        curArtDesc = curArtDesc.split('<div class="userControls')[0]
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], i, curArtDesc)

        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_lchop(curArtTitle, "Re: ")
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

        # pubDates magic from "20:22 01.09.2019" to Datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "eile", "%d.%m.%Y", offsetDays=-1)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "täna", "%d.%m.%Y", offsetDays=0)
        curArtPubDate = parsers_datetime.add_missing_date_to_string(
            curArtPubDate, "%H:%M %d.%m.%Y", " %d.%m.%Y")
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%H:%M %d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
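
# For reference: a sketch of how parsers_datetime.replace_string_with_timeformat
# might resolve the relative-day words "eile" (yesterday) and "täna" (today)
# seen above; an assumption, not the project's actual implementation.
from datetime import datetime, timedelta

def replace_string_with_timeformat(rawString, search, timeFormat, offsetDays=0):
    # substitute the matched word with a concrete date so strptime can parse
    # the result, e.g. "20:22 eile" -> "20:22 01.09.2019"
    if search in rawString:
        concreteDate = (datetime.now() + timedelta(days=offsetDays)).strftime(timeFormat)
        rawString = rawString.replace(search, concreteDate)
    return rawString
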
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//body/div[2]/div[1]/main/div/div[3]/div/div/a/h2/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//body/div[2]/div[1]/main/div/div[3]/div/div/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # description
            curArtDesc = parsers_common.xpath_to("single",
                                                 pageTree,
                                                 '//div[@class="col-md-12"]',
                                                 parent=True)
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single",
                    pageTree,
                    '//div[@class="col-md-8"]',
                    parent=True)
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single",
                    pageTree,
                    '//div[@class="img-open-area basic-content pb-4"]',
                    parent=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree, '//a[@data-lightbox="treimages"][1]/@href')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

            # pubDates magic from "7. märts 2021" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="col-sm-12 col-md-auto text-sm-center text-md-right"]/div[@class="mt-1 text-uppercase"]/small/text()'
            )
            curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d. %m %Y")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
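
# For reference: the list_add_or_assign calls above suggest a helper that
# keeps every per-article list aligned with "urls"; a hypothetical sketch:
def list_add_or_assign(targetList, index, value):
    # overwrite the slot when it already exists, otherwise append, so that
    # index i always refers to the same article in every list
    if index < len(targetList):
        targetList[index] = value
    else:
        targetList.append(value)
    return targetList
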
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-prolog"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-image"]/@style'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/div[@class="sb-article-cnt"]/div[@class="sb-article-title"]/h3/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div[3]/div/div[1]/div[@class="sb-article"]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="sg-article-details"]/div[@class="author"]/text()'
            )
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description
            curArtDesc = parsers_common.xpath_to(
                "single",
                pageTree,
                '/html/body/div[3]/div/div[@class="page-content"]/div[@class="sg-article"]/div[@class="sg-article-text"]',
                parent=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # pubDates magic from "18.08.2019 21:35" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="sg-article-details"]/div[@class="date"]/text()')
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d.%m.%Y %H:%M")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/p',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="right-content"]/div[@class="application-date"][1]/text()'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "jurist",
        "logopeed",
        "pedagoog",
        "psühholoog",
        "raamatupidaja",
        "sotsiaal",
        "õpetaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "Avaldatud: 12.12.2017" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = curArtPubDate.split(': ')[1]
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # url
        curArtUrl = parsers_common.get(articleDataDict["urls"], i)
        curArtUrl = curArtUrl.split('?ref=')[0]
        articleDataDict["urls"] = parsers_common.list_add_or_assign(
            articleDataDict["urls"], i, curArtUrl)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["authors"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[2]/text()')
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/time/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[@itemprop="url"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//section/article/h2/a[@itemprop="url"]/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "2021-01-06T10:11:04Z" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dT%H:%M:%S%z")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache="cacheAll")

            # description
            curArtDesc = parsers_common.xpath_to("single",
                                                 pageTree,
                                                 '//body/div//article',
                                                 parent=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree, '//body/div//article/p/img/@src')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

    return articleDataDict
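
# For reference: every example relies on parsers_common.xpath_to. A minimal
# sketch assuming an lxml tree; the real helper also honors the multi= and
# count= flags seen elsewhere on this page, which this sketch ignores.
from lxml import etree

def xpath_to(mode, pageTree, xpathString, parent=False, multi=False, count=False):
    results = pageTree.xpath(xpathString)
    if parent:
        # serialize matched elements to HTML instead of taking text nodes
        results = [etree.tostring(el, encoding="unicode") for el in results]
    else:
        results = [str(r) for r in results]
    if mode == "list":
        return results
    return results[0] if results else ""
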
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # descriptions
            curArtDesc = parsers_common.xpath_to("single",
                                                 pageTree,
                                                 '//div[@class="col-sm-9"]/p',
                                                 multi=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # pubDates magic from "Avaldatud: 14 detsember, 2017" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()')
            curArtPubDate = parsers_datetime.months_to_int(
                curArtPubDate.split(':')[1])
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d %m, %Y")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="js-newsline-container"]/span[1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="js-newsline-container"]/div/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="js-newsline-container"]/div/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "14 dets  2017 11:34" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d %m %Y %H:%M")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # description
            curArtDesc = parsers_common.xpath_to(
                "single", pageTree, '//div[@class="news-preview"]/div/text()')
            if not curArtDesc:
                curArtDesc = parsers_common.xpath_to(
                    "single",
                    pageTree,
                    '//div[@class="content_item"]',
                    parent=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
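
# For reference: a sketch of what parsers_datetime.months_to_int might do,
# assuming it maps Estonian month names (and abbreviations such as "dets")
# to month numbers; the real helper may differ.
def months_to_int(rawString):
    months = {
        "jaanuar": "1", "veebruar": "2", "märts": "3", "aprill": "4",
        "mai": "5", "juuni": "6", "juuli": "7", "august": "8",
        "september": "9", "oktoober": "10", "november": "11", "detsember": "12",
    }
    # e.g. "14 dets 2017 11:34" -> "14 12 2017 11:34"
    for name, number in months.items():
        for candidate in (name, name[:4]):  # full name, then 4-letter stem
            if candidate in rawString:
                rawString = rawString.replace(candidate, number)
                break
    return rawString
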
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//article/a/div/div/img/@src')
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//article/a/div[@class="node__body"]/p[@class="node__date"]/span/@content'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//article/a/div[@class="node__body"]/h3/span/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree,
                                                      '//article/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "2021-03-23T12:35:36+00:00" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dT%H:%M:%S%z")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # description
            curArtDesc = parsers_common.xpath_to(
                "single",
                pageTree,
                '//div[@class="node__content"]/div/div[@class="field__item"]',
                parent=True)
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["authors"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()'
    )
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[2]/div[1]/div/span[1]',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/time/@datetime'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/a/div/div[1]/div[1]/span/span/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//section/div/div/div/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/a/@href'
    )

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "2021-02-12T16:08:02.000Z" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%Y-%m-%dT%H:%M:%S.000Z")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["pubDates"] =   parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/p[@class="b-posts__list-item-summary"]/text()')
    articleDataDict["titles"] =     parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/text()')
    articleDataDict["urls"] =       parsers_common.xpath_to("list", pageTree, '//li[@class="b-posts__list-item"]/h2[@class="b-posts__list-item-title"]/a/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "30.01.2021" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDate, "%d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # description
            curArtDesc = parsers_common.xpath_to("single", pageTree, '//div[@class="b-article"]', parent=True)
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()'
    )
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # walk through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//article/div[@class="bbWrapper"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href'
            )

            # walk through the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate,
                    "%Y-%m-%dT%H:%M:%S%z")  # 2021-01-28T16:15:42-0500
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "teema " + str(i + 1) + " postitus nr. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
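
# For reference: a sketch of how parsers_common.article_posts_range might cap
# how many posts are taken from one topic page; an assumption based on the
# "set 0 for all posts" comment above, not the real implementation.
def article_posts_range(urlsList, maxPosts):
    # iterate only over the newest posts; 0 (or less) means "all posts"
    if maxPosts <= 0:
        return range(len(urlsList))
    return range(max(0, len(urlsList) - maxPosts), len(urlsList))
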
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="grid-main--item "]/div/div[2]/div[1]/h6/a[1]/@href')

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # titles
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_remove_clickbait(curArtTitle)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree, '//span[@class="author"]/text()')
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description
            curArtDesc1 = parsers_common.xpath_to(
                "single",
                pageTree,
                '//div[@class="page-layout--block"][1]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]',
                parent=True)
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to(
                    "single",
                    pageTree,
                    '//div[@class="page-layout--content"]/div[@class="page-layout--inner"]/div[@class="article-main--content article-main--excerpt formatted--content"]',
                    parent=True)

            curArtDesc2 = parsers_common.xpath_to(
                "single",
                pageTree,
                '//div[@class="page-layout--block"][2]/div[@class="page-layout--content"]/div[@class="page-layout--inner"]',
                parent=True,
                multi=True)

            curArtDesc = curArtDesc1 + curArtDesc2
            curArtDesc = curArtDesc.split("Edasi lugemiseks")[0]
            curArtDesc = curArtDesc.split("Jaga:")[0]
            curArtDesc = curArtDesc.split("Samal teemal")[0]
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="page-layout--block"][1]//div[@class="image-gallery-image first-in-gallery"][1]/picture[1]/img[@class="article-image"]/@src'
            )
            if not curArtImg:
                curArtImg = parsers_common.xpath_to(
                    "single", pageTree,
                    '//div[@class="part"][1]/div/p/img/@src')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

            # pubDates from "täna 16:53" to datetime()
            curArtPubDate = parsers_common.xpath_to(
                "single", pageTree, '//div[@class="details--inner"]/text()')
            curArtPubDate = curArtPubDate.split(",")[-1]
            curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                curArtPubDate, "eile", "%d. %m %Y ", offsetDays=-1)
            curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                curArtPubDate, "täna", "%d. %m %Y ", offsetDays=0)
            curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
            curArtPubDate = parsers_datetime.raw_to_datetime(
                curArtPubDate, "%d. %m %Y %H:%M")
            articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
                articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()'
    )
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # walk through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i] +
                '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true',
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()'
            )
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href'
            )

            # walk through the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j,
                    parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates magic from "15.01.2012 23:49" to datetime()
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                    curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(
                    curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "teema postitus nr. " + str(j + 1) + "/(" +
                    str(len(articlePostsDict["urls"])) + ") on " +
                    articlePostsDict["urls"][j], 2)

    return articleDataDict
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/@href')

    articleDataDictPubDatesDay = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/span/text()')

    # remove unwanted content: titles
    dictList = [
        "Sakala kuulutused",
        "Tartu Börs,",
        "positiivseid proove",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "24.12.2017 17:51" to datetime()
        curArtPubDateDay = ""
        if len(articleDataDictPubDatesDay) - 1 >= i:
            curArtPubDateDay = parsers_common.get(articleDataDictPubDatesDay, i)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Eile", "%d.%m.%Y", offsetDays=-1)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Täna", "%d.%m.%Y", offsetDays=0)

        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDateDay + curArtPubDate, "%d.%m.%Y, %H:%M")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(articleDataDict["pubDates"], i, curArtPubDate)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to("single", pageTree, '//span[@class="article-authors__name"]/text()', multi=True)
            articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, curArtAuthor)

            # description1 - before the image
            curArtDesc1 = ""
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--video"][1]', parent=True, count=True)
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--articleBullets"]', parent=True, count=True)

            # description2 - between the image and the audio section
            curArtDesc2 = ""
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="articleBody"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body__item--lead"]', parent=True, count=True, multi=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//span[@class="figure__caption--title"][1]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--first-child article-body__item--lead"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="description"]', parent=True, count=True)

            # description3 - after the audio section
            curArtDesc3 = ""
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement"]', parent=True, count=True, multi=True)
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--premium-indicator"]', parent=True, count=True)

            # description4 - grey fade-out teaser
            curArtDesc4 = ""
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--teaser"]', parent=True, count=True)
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--gallery"]', parent=True, count=True, multi=True)

            # image
            curArtImg = ""
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//div[@class="article-superheader article-superheader--figure"]/div[@class="article-superheader__background"]/@style', count=True)
                curArtImg = curArtImg.split("url('")[-1].strip("');")
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//figure[@class="figure"]/img[@class="figure--has-fullscreen"]/@src', count=True)
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//meta[@property="og:image"]/@content', count=True)

            # sanity checks
            if "-kuulutused-" in curArtUrl:
                rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: kuulutused", 2)
            elif "-karikatuur" in curArtUrl:
                rss_print.print_debug(__file__, "ei kontrolli plokke, kuna: karikatuur", 2)
            else:
                if not curArtDesc1:
                    rss_print.print_debug(__file__, "1. plokk on tühi. (Pildieelne loendiplokk puudub?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 4)
                if not curArtDesc2:
                    rss_print.print_debug(__file__, "2. plokk on tühi. (Pildi ja kuulamise vahe plokk puudub?) - " + curArtUrl, 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 4)
                if not curArtDesc3:
                    rss_print.print_debug(__file__, "3. plokk on tühi. (Pärast kuulamist plokk puudub?)", 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 4)
                if not curArtDesc4:
                    if "button--for-subscription" in curArtDesc3:
                        curArtDesc3 = curArtDesc3.split('<span class="button--for-subscription')[0]
                        rss_print.print_debug(__file__, "4. plokk on tühi. (Kolmandas plokis oli teemant)", 3)
                    else:
                        rss_print.print_debug(__file__, "4. plokk on tühi. (Hall fadeout plokk puudub?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 4)
                if not curArtImg:
                    rss_print.print_debug(__file__, "pilti ei leitud.", 0)
                else:
                    rss_print.print_debug(__file__, "curArtImg = " + curArtImg, 4)

                if curArtDesc1 and curArtDesc1 == curArtDesc2:
                    rss_print.print_debug(__file__, "1. ja 2. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                if curArtDesc2 and curArtDesc2 == curArtDesc3:
                    rss_print.print_debug(__file__, "2. ja 3. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                if curArtDesc3 and curArtDesc3 == curArtDesc4:
                    rss_print.print_debug(__file__, "3. ja 4. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1)
                if curArtDesc4 and curArtDesc4 == curArtDesc1:
                    rss_print.print_debug(__file__, "4. ja 1. plokk langevad kokku", 0)
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc3, 1)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc4, 1)

            curArtDesc = curArtDesc1 + ' ' + curArtDesc2 + ' ' + curArtDesc3 + ' ' + curArtDesc4

            if "button--for-subscription" in curArtDesc:
                curArtDesc = curArtDesc.replace(' tellijatele', '')
                curArtDesc = curArtDesc.replace('<a href="https://minumeedia.postimees.ee/kampaania/" target="_blank" class="my-media-link">digipaketi</a>', '')
                curArtDesc = curArtDesc.replace('<div class="article-body__item article-body__item--audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<div class="audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<img data-lazy-src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" src="/v5/img/icons/diamond-black-on-yellow.svg" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<img src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription">', "<span>")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__diamond diamond--ee">', "<span>")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__text"', "")
                curArtDesc = curArtDesc.replace('Artikkel on kuulatav', '')
                curArtDesc = curArtDesc.replace('Tellijale', '')

            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)

            articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImg)

    return articleDataDict
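
# A minimal driver sketch showing how these parsers appear to be called;
# the empty-dict shape and the domain string are assumptions drawn from the
# examples above, not from the real project.
from lxml import html

emptyDataDict = {
    "authors": [], "descriptions": [], "images": [],
    "pubDates": [], "titles": [], "urls": [],
}
pageTree = html.fromstring("<html><body></body></html>")  # normally a fetched page
articleDataDict = fill_article_dict(emptyDataDict, pageTree, "news.example.com")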