Example #1
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//ul/li/div/div/div[@class="horiz-offer-card__content"]/div[@class="horiz-offer-card__desc"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul/li/div/div/div[@class="horiz-offer-card__image"]/a/img/@data-src'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul/li/div/div/div[@class="horiz-offer-card__image"]/a/@title')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//ul/li/div/div/div[@class="horiz-offer-card__image"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "radiaator",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    articleDataDict = parsers_common.dict_reverse_order(articleDataDict)

    return articleDataDict
Example #2
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//article/ul/li/figure/div[@class="offer-thumb__content"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//article/ul/li/figure/figure/a/@data-original')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//article/ul/li/figure/div[@class="offer-thumb__content"]/h3/a/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//article/ul/li/figure/div[@class="offer-thumb__content"]/h3/a/@href')

    # remove unwanted content: titles
    dictList = [
        "az ",
        "jawa",
        "kitarr",
        "poolvääriskivi",
        "tuulesuunaja",
        "viiuli",
        "xtrons",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    return articleDataDict
Example #3
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//table[@class="views-table cols-5"]/tbody/tr',
        parent=True)
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="views-table cols-5"]/tbody/tr/td[1]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="views-table cols-5"]/tbody/tr/td[5]/div[1]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "ametnik",
        "hooldaja",
        "jurist",
        "koristaja",
        "logopeed",
        "pedagoog",
        "psühholoog",
        "raamatupidaja",
        "sanitar",
        "teenindaja",
        "uurija",
        "õpetaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    return articleDataDict
Example #4
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr', parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href')

    # keep only wanted content ("not in" drops rows whose descriptions lack "Tartu")
    dictList = ["Tartu"]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "not in", "descriptions")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "29.08.19" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
Example #5
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["authors"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="message"]/div[@class="name"]',
        parent=True)
    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="message"]/div[@class="content"]',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="posttime"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="title"]/a[3]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="message"]/div[@class="title"]/a[3]/@href')

    # remove unwanted content: titles
    dictList = [
        "Hoiatus! Läbisõit 100% keritud",
        "Kaebused",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # description
        curArtDesc = parsers_common.get(articleDataDict["descriptions"], i)
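        # keep only the content before the userControls widget markup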
        curArtDesc = curArtDesc.split('<div class="userControls')[0]
        articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
            articleDataDict["descriptions"], i, curArtDesc)

        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_lchop(curArtTitle, "Re: ")
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

        # pubDates magic from "20:22 01.09.2019" to Datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "eile", "%d.%m.%Y", offsetDays=-1)
        curArtPubDate = parsers_datetime.replace_string_with_timeformat(
            curArtPubDate, "täna", "%d.%m.%Y", offsetDays=0)
        curArtPubDate = parsers_datetime.add_missing_date_to_string(
            curArtPubDate, "%H:%M %d.%m.%Y", " %d.%m.%Y")
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%H:%M %d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

    return articleDataDict
Example #6
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="box-news-block-title "]/a[@title]/@title')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="box-news-block-title "]/a[@title]/@href')

    # remove unwanted content: urls
    dictList = [
        "https://sky.ee/rock-fmi-hommikuprogramm-igal-toopaeval-kell-7-10/"
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "urls")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="post-content"]/div[@class="article-page-author"]/p/text()'
            )
            if ": " in curArtAuthor:
                curArtAuthor = curArtAuthor.split(": ")[1]
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description
            curArtDesc1 = parsers_common.xpath_to(
                "single", pageTree,
                '//div[@class="posts"]/div[@class="two-side-content-container clearfix"][1]//div[@class="post-content"]/strong/p/text()'
            )
            curArtDesc2 = parsers_common.xpath_to(
                "single",
                pageTree,
                '//div[@class="posts"]/div[@class="two-side-content-container clearfix"][2]//div[@class="post-content"]',
                parent=True)
            curArtDesc = curArtDesc1 + curArtDesc2
            if '<p class="related-cta">LOE KA NEID UUDISEID!</p>' in curArtDesc:
                curArtDesc = curArtDesc.split(
                    '<p class="related-cta">LOE KA NEID UUDISEID!</p>')[0]
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

    return articleDataDict
Example #7
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/p',
        parent=True)
    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="right-content"]/div[@class="application-date"][1]/text()'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="applied-jobs"]/div/div[@class="job-content"]/div[@class="left-content"]/h2/a/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "jurist",
        "logopeed",
        "pedagoog",
        "psühholoog",
        "raamatupidaja",
        "sotsiaal",
        "õpetaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "Avaldatud: 12.12.2017" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = curArtPubDate.split(': ')[1]
        curArtPubDate = parsers_datetime.raw_to_datetime(
            curArtPubDate, "%d.%m.%Y")
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # url
        curArtUrl = parsers_common.get(articleDataDict["urls"], i)
        curArtUrl = curArtUrl.split('?ref=')[0]
        articleDataDict["urls"] = parsers_common.list_add_or_assign(
            articleDataDict["urls"], i, curArtUrl)

    return articleDataDict
Example #8
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 10)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="reply-count"]/span[@data-xf-init="tooltip"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/text()'
    )
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@qid="thread-item-parent"]/div[@qid="thread-item"]/div/div[@class="structItem-title"]/a[@qid="thread-item-title"]/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "$",
        "*:::::::the official what did you do to you mkiv today thread::::::::*",
        "??",
        "Ask a Simple Question",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page contents
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
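            # request an absurdly high page number; presumably the forum then serves the thread's last page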
            curParentUrl = curParentUrl + "page-1000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPagesStamp)

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree, '//h4[@qid="message-username"]//text()')
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//article/div[@class="bbWrapper"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/time/@datetime'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="message-attribution-main"]/a[@class="u-concealed"][2]/@href'
            )

            # go through the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate,
                    "%Y-%m-%dT%H:%M:%S%z")  # 2021-01-28T16:15:42-0500
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #9
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '/html/body/div/div/div/div[@id="page"]/ul/li[@class="clear "]/div[@class="content"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div/div/div/div[@id="page"]/ul/li[@class="clear "]/p[@class="img"]/a/img/@src'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div/div/div/div[@id="page"]/ul/li[@class="clear "]/div[@class="content"]/h2/a/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '/html/body/div/div/div/div[@id="page"]/ul/li[@class="clear "]/div[@class="content"]/h2/a/@href'
    )

    # video
    # remove unwanted content: titles
    dictList = [
        "2000.ee:",
        "AK filmikroonika 1958-1991:",
        "Aktuaalne kaamera",
        "Eesti Gloobus",
        "Hommik Anuga:",
        "Insight",
        "Iseolemine:",
        "Johannese lähetamine",
        "Kes keda?",
        "Kodukäijad",
        "Koolitants",
        "Lasteekraan",
        "Lastetuba",
        "Laulge kaasa!",
        "Meie kõrval:",
        "Mis teie kodus uudist?:",
        "Mis? Kus? Millal?:",
        "NOVA:",
        "Noor meister:",
        "Nädala intervjuu:",
        "OP:",
        "Oma tõde:",
        "Ongi Koik",
        "Pealtnägija",
        "Peegel:",
        "Plekktrumm",
        "Prillitoos:",
        "Püha päeva palvus:",
        "Rahvas laulab:",
        "Rakett 69:",
        "Reibas hommik",
        "Ringvaade",
        "Suus sulav Eesti:",
        "TECHnolik",
        "TV 10 olümpiastarti",
        "Taevavalvurid",
        "Tarmo ja Aet liiguvad:",
        "Terevisioon:",
        "Tähendamisi",
        "Tähtede lava",
        "Välisilm",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    # remove unwanted content: description
    dictList = [
        "Johannes Tralla",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "descriptions")

    # audio
    # remove unwanted content: titles
    dictList = [
        "AIATARK.",
        "DELTA.",
        "GOGOL.",
        "HOMMIKUMÕTISKLUS.",
        "HUVITAJA.",
        "KIHNUKEELSED UUDISED",
        "KIRIKUELU.",
        "KITARRIMUUSIKAST JA -MÄNGIJAIST.",
        "KULDRANDEVUU.",
        "LOETUD JA KIRJUTATUD.",
        "LUULERUUM.",
        "LUULETUS.",
        "MELOTURNIIR.",
        "MINITURNIIR.",
        "MNEMOTURNIIR",
        "MULGIKEELSED UUDISED",
        "MUST KLAHV, VALGE KLAHV.",
        "NAISTESAUN.",
        "OLUKORRAST RIIGIS",
        "PÄEVAKAJA",
        "RAHVA OMA KAITSE",
        "RAHVA TEENRID",
        "REPORTERITUND.",
        "SETUKEELSED UUDISED",
        "SIILILEGI SELGE!",
        "SPORDIPÜHAPÄEV.",
        "TETRIS.",
        "UUDISED.",
        "VÕRUKEELSED UUDISED",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    return articleDataDict
Example #10
        parsers_common.dict_stats(curArticleDataDict)

        # append the latest data
        rss_print.print_debug(
            __file__, "adding data from the last subpage: " + curDomainLong,
            2)
        articleDataDict = parsers_common.dict_add_dict(articleDataDict,
                                                       curArticleDataDict)

    if not articleDataDict["urls"]:
        rss_print.print_debug(__file__,
                              "no data found on page: " + curDomainShort, 0)
        continue

    # remove unwanted content: titles
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, rss_config.BAD_TITLES, "in", "titles")

    # remove unwanted content: descriptions
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, rss_config.BAD_DESCRIPTIONS, "in", "descriptions")

    # combine rss file
    rss_print.print_debug(__file__, "asume koostame rss-i: " + curDomainLong,
                          3)
    rssContent = rss_maker.rssmaker(articleDataDict, curTitle, curDomainShort,
                                    curDomainLong, curDescription)

    # make sure we have subfolder
    OS_PATH = os.path.dirname(os.path.abspath(__file__))
    LATEST_FEEDS_PATH = OS_PATH + "/" + "latest_feeds"
    if not os.path.exists(LATEST_FEEDS_PATH):
        os.makedirs(LATEST_FEEDS_PATH)  # assumed body; the snippet is truncated here
Example #11
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/span[@class="adds-list-meta"]/text()'
    )
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/h3[@class="adds-list-title"]/text()'
    )
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="tabs-content tab-active shadow"]/ul[@class="adds-list list"]/li[@class="adds-list-item"]/a[h3/@class="adds-list-title"]/@href'
    )

    # reverse into the desired order
    parentPages = parsers_common.dict_reverse_order(parentPages)

    # remove unwanted content: titles
    dictList = [
        "AMD",
        "Apple",
        "Assassin",
        "Batman",
        "Battlefield",
        "Call of Duty",
        "Cyberpunk",
        "Diablo 2",
        "Dying",
        "Escape From Tarkov",
        "Euro Truck",
        "Evil",
        "FIFA",
        "Far Cry",
        "Forza",
        "Galaxy",
        "Grand Theft",
        "IPhon",
        "Kindle",
        "MMORPG",
        "MSI",
        "MacBook",
        "MacOS",
        "Mafia",
        "Mass Effect",
        "Meizu",
        "Minecraft",
        "Nintendo",
        "Pixel",
        "PlayStation",
        "Steam",
        "Tanks",
        "Vidia",
        "War Thunder",
        "Watercool",
        "Windows",
        "Xbox",
        "arvutikast",
        "exile",
        "foorumiga seotud",
        "konsool",
        "korpust",
        "moderaatorite",
        "seotud vead",
        "siia lingid",
        "toiteplok",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page contents
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            parentPages["stamps"][i] = parentPages["stamps"][i].split("/")[0]
            parentPages["urls"][i] = parentPages["urls"][i].split("&sid=")[0]
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i],
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[1]/span[@class="name"]/b/a/text()'
            )
            articlePostsDict["descriptions1"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][1]',
                parent=True)
            articlePostsDict["descriptions2"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[3]/td/span[@class="postbody"][2]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/span[@class="postdetails"][1]/text()[1]'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//table[@class="forumline"]/tr/td[2]/table/tr[1]/td/span[@class="postdetails"]/a/@href'
            )

            # go through the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions1"], j)
                if not curArtDesc:
                    curArtDesc = parsers_common.get(
                        articlePostsDict["descriptions2"], j)
                curArtDesc = curArtDesc.replace(
                    '</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(
                    curArtDesc, '<div class="quotetitle">', "</div>",
                    "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = curArtPubDate[0:16]
                curArtPubDate = parsers_datetime.guess_datetime(
                    curArtPubDate)  # 14.07.2020 07:59
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(
                    curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #12
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["pubDates"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="history-item"]/span[@class="history-date ng-binding"]/text()'
    )
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="history-item"]/p[@class="history-header"]/a[@class="ng-binding"]/text()[1]'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="history-item"]/p[@class="history-header"]/a[@class="ng-binding"]/@href'
    )

    lastArtPubDate = ""
    iMinus = 0
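    # compensates the loop index when rows get deleted from the dict mid-loop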

    # remove unwanted content: titles
    dictList = [
        " reket",
        " võidu",
        "Aktuaalne kaamera",
        "ERR-i teleuudised",
        "ETV spordi",
        "Ilm ",
        "OTSE ",
        "Päevakaja",
        "Raadiouudised",
        "Viipekeelsed uudised",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        i = i - iMinus

        # pubDates magic from "11:34" to datetime()
        curArtPubDate = parsers_common.get(articleDataDict["pubDates"], i)
        curArtPubDate = parsers_datetime.raw_to_datetime_guess_missing(
            curArtPubDate, lastArtPubDate, "%Y %m %d ", "%H:%M", -1)
        lastArtPubDate = curArtPubDate
        articleDataDict["pubDates"] = parsers_common.list_add_or_assign(
            articleDataDict["pubDates"], i, curArtPubDate)

        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        if "|" in curArtTitle:
            curArtTitle = curArtTitle.split("|")[-1]
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to(
                "single", pageTree,
                '//article/div[@class="body"]/div/div[@class="meta"]/section/div[@class="byline"]/span/text()'
            )
            articleDataDict["authors"] = parsers_common.list_add_or_assign(
                articleDataDict["authors"], i, curArtAuthor)

            # description
            curArtDesc1 = parsers_common.xpath_to(
                "single",
                pageTree,
                '//article/div[@class="body"]/div[@class="lead"]',
                parent=True)
            curArtDesc2 = parsers_common.xpath_to(
                "single",
                pageTree,
                '//article/div[@class="body"]/div[@class="text flex-row"]',
                parent=True)
            if not curArtDesc2:
                rss_print.print_debug(
                    __file__, "tühja sisuga uudis, eemaldame rea " + str(i), 1)
                articleDataDict = parsers_common.dict_del_article_index(
                    articleDataDict, i)
                iMinus += 1
                continue
            curArtDesc = curArtDesc1 + "<br>" + curArtDesc2
            articleDataDict[
                "descriptions"] = parsers_common.list_add_or_assign(
                    articleDataDict["descriptions"], i, curArtDesc)

            # image
            curArtImg = parsers_common.xpath_to(
                "single", pageTree,
                '/html/head/meta[@property="og:image"]/@content')
            if not curArtImg:
                curArtImg = parsers_common.xpath_to(
                    "single", pageTree,
                    '/html/head/meta[@property="image"]/@content')
            articleDataDict["images"] = parsers_common.list_add_or_assign(
                articleDataDict["images"], i, curArtImg)

    articleDataDict = parsers_common.dict_reverse_order(articleDataDict)

    return articleDataDict
Example #13
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 8)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX / maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    version = 0
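    # version 1 selects the older table-based phpBB layout used by kipper.ee and militaar.net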
    if "kipper.ee" in domain or "militaar.net" in domain:
        version = 1
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/p[@class="topicdetails"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/text()')
        parentPages["urls"] =   parsers_common.xpath_to("list", pageTree, '//table[@class="tablebg"]//tr/td/a[@class="topictitle"]/@href')
    else:
        parentPages["stamps"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/dd[@class="posts"]/text()')
        parentPages["titles"] = parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/text()')
        parentPages["urls"] =   parsers_common.xpath_to("list", pageTree, '//ul[2]/li/dl/*/div/a[@class="topictitle"]/@href')

    if not parentPages["urls"] and "arutelud.com" in domain:
        rss_print.print_debug(__file__, "aktiivseid teemasid ei leitud, arutelude foorumis külastame mammutteemat", 1)
        parentPages["stamps"] = parsers_common.list_add_or_assign(parentPages["stamps"], 0, "")
        parentPages["titles"] = parsers_common.list_add_or_assign(parentPages["titles"], 0, "Arutelud")
        parentPages["urls"] =   parsers_common.list_add_or_assign(parentPages["urls"], 0, "https://arutelud.com/viewtopic.php?f=3&t=4&sd=d&sk=t&st=7")

    # remove unwanted content: titles
    dictList = [
        "Race.Fi:",
        "Write my",
        "PÕLVAMAA, VÕRUMAA JA VALGAMAA CB JA HAM SIDE",
    ]
    parentPages = parsers_common.article_data_dict_clean(parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page contents
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("&sid=")[0]
            curParentUrl = curParentUrl + "&start=100000"
            parentPagesStamp = parsers_common.get(parentPages["stamps"], i)
            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curParentUrl, cache='cacheStamped', pageStamp=parentPagesStamp)

            articlePostsDict = {}
            if version in (1, 2):
                rss_print.print_debug(__file__, "kasutame spetsiifilist hankimist, domain = " + domain, 2)
                articlePostsDict["authors"] =       parsers_common.xpath_to("list", pageTree, '//tr/td/b[@class="postauthor"]/text()')
                articlePostsDict["descriptions"] =  parsers_common.xpath_to("list", pageTree, '//tr/td/div[@class="postbody"][1]', parent=True)
                articlePostsDict["pubDates"] =      parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/text()')
                articlePostsDict["urls"] =          parsers_common.xpath_to("list", pageTree, '//tr/td[@class="gensmall"]/div[@style="float: right;"]/a/@href')
            else:
                rss_print.print_debug(__file__, "kasutame üldist hankimist, domain = " + domain, 3)
                articlePostsDict["authors"] =       parsers_common.xpath_to("list", pageTree, '//p[@class="author"]//strong//text()')
                articlePostsDict["descriptions"] =  parsers_common.xpath_to("list", pageTree, '//div[@class="content"]', parent=True)
                articlePostsDict["pubDates"] =      parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/@datetime')
                articlePostsDict["urls"] =          parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/a/@href')

                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/time/text()')
                if not articlePostsDict["pubDates"]:
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[1]')
                if articlePostsDict["pubDates"] and len(articlePostsDict["pubDates"][0]) < 5:  # guard against indexing an empty list
                    rss_print.print_debug(__file__, "fetched time[0] too short: '" + articlePostsDict["pubDates"][0] + "', trying an alternative...", 0)
                    articlePostsDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//p[@class="author"]/text()[2]')
                    if len(articlePostsDict["pubDates"][0]) < 5:
                        rss_print.print_debug(__file__, "fetched time[0] too short: '" + articlePostsDict["pubDates"][0] + "'", 0)
                    else:
                        rss_print.print_debug(__file__, "fetched time[0]: '" + articlePostsDict["pubDates"][0] + "'", 4)
                if not articlePostsDict["pubDates"]:
                    rss_print.print_debug(__file__, "could not fetch any timestamps", 0)

            # go through the topic's posts
            for j in parsers_common.article_posts_range(articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(articleDataDict["authors"], j, parsers_common.get(articlePostsDict["authors"], j))

                # description
                curArtDesc = parsers_common.get(articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.replace('</div><div class="quotecontent">', '<br>')
                curArtDesc = parsers_common.fix_quatation_tags(curArtDesc, '<div class="quotetitle">', "</div>", "<blockquote>", "</blockquote>")
                articleDataDict["descriptions"] = parsers_common.list_add(articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.months_to_int(curArtPubDate)
                curArtPubDate = parsers_datetime.remove_weekday_strings(curArtPubDate)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "eile", "%d %m %Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(curArtPubDate, "täna", "%d %m %Y", offsetDays=0)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(articlePostsDict["urls"], j)
                curArtUrl = parsers_common.link_add_end(curArtUrl, articlePostsDict["urls"][j])
                articleDataDict["urls"] = parsers_common.list_add(articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(__file__, "teema " + str(i + 1) + " postitus nr. " + str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) + ") on " + articlePostsDict["urls"][j], 2)

    return articleDataDict
Example #14
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["pubDates"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/text()')
    articleDataDict["titles"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/a[@class="article-content__headline"]/@href')

    articleDataDictPubDatesDay = parsers_common.xpath_to("list", pageTree, '//div[@class="article-content"]/div[@class="article-content__meta"]/span[@class="article-content__date-published"]/span/text()')

    # remove unwanted content: titles
    dictList = [
        "Sakala kuulutused",
        "Tartu Börs,",
        "positiivseid proove",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # pubDates magic from "24.12.2017 17:51" to datetime()
        curArtPubDateDay = ""
        if len(articleDataDictPubDatesDay) - 1 >= i:
            curArtPubDateDay = parsers_common.get(articleDataDictPubDatesDay, i)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Eile", "%d.%m.%Y", offsetDays=-1)
            curArtPubDateDay = parsers_datetime.replace_string_with_timeformat(curArtPubDateDay, "Täna", "%d.%m.%Y", offsetDays=0)

        curArtPubDate = articleDataDict["pubDates"][i]
        curArtPubDate = parsers_datetime.raw_to_datetime(curArtPubDateDay + curArtPubDate, "%d.%m.%Y, %H:%M")
        articleDataDict["pubDates"][i] = curArtPubDate

        if parsers_common.should_get_article_body(i):
            curArtUrl = parsers_common.get(articleDataDict["urls"], i)

            # load article into tree
            pageTree = parsers_common.get_article_tree(domain, curArtUrl, cache='cacheAll')

            # author
            curArtAuthor = parsers_common.xpath_to("single", pageTree, '//span[@class="article-authors__name"]/text()', multi=True)
            articleDataDict["authors"] = parsers_common.list_add_or_assign(articleDataDict["authors"], i, curArtAuthor)

            # description1 - before the image
            curArtDesc1 = ""
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--video"][1]', parent=True, count=True)
            if not curArtDesc1:
                curArtDesc1 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--articleBullets"]', parent=True, count=True)

            # description2 - between the image and the listen widget
            curArtDesc2 = ""
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="articleBody"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body__item--lead"]', parent=True, count=True, multi=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//span[@class="figure__caption--title"][1]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--first-child article-body__item--lead"]', parent=True, count=True)
            if not curArtDesc2:
                curArtDesc2 = parsers_common.xpath_to("single", pageTree, '//div[@itemprop="description"]', parent=True, count=True)

            # description3 - after the listen widget
            curArtDesc3 = ""
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement"]', parent=True, count=True, multi=True)
            if not curArtDesc3:
                curArtDesc3 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--premium-indicator"]', parent=True, count=True)

            # description4 - grey fade-out teaser
            curArtDesc4 = ""
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--htmlElement article-body--teaser"]', parent=True, count=True)
            if not curArtDesc4:
                curArtDesc4 = parsers_common.xpath_to("single", pageTree, '//div[@class="article-body__item article-body__item--gallery"]', parent=True, count=True, multi=True)

            # image
            curArtImg = ""
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//div[@class="article-superheader article-superheader--figure"]/div[@class="article-superheader__background"]/@style', count=True)
                curArtImg = curArtImg.split("url('")[-1].strip("');")
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//figure[@class="figure"]/img[@class="figure--has-fullscreen"]/@src', count=True)
            if not curArtImg:
                curArtImg = parsers_common.xpath_to("single", pageTree, '//meta[@property="og:image"]/@content', count=True)

            # sanity checks
            if "-kuulutused-" in curArtUrl:
                rss_print.print_debug(__file__, "skipping block checks: classified ads", 2)
            elif "-karikatuur" in curArtUrl:
                rss_print.print_debug(__file__, "skipping block checks: caricature", 2)
            else:
                if not curArtDesc1:
                    rss_print.print_debug(__file__, "block 1 is empty. (Missing pre-image bullet block?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 4)
                if not curArtDesc2:
                    rss_print.print_debug(__file__, "block 2 is empty. (Missing block between image and listen widget?) - " + curArtUrl, 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 4)
                if not curArtDesc3:
                    rss_print.print_debug(__file__, "block 3 is empty. (Missing after-listen block?)", 0)
                else:
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 4)
                if not curArtDesc4:
                    if "button--for-subscription" in curArtDesc3:
                        curArtDesc3 = curArtDesc3.split('<span class="button--for-subscription')[0]
                        rss_print.print_debug(__file__, "block 4 is empty. (Block 3 carried the subscription diamond)", 3)
                    else:
                        rss_print.print_debug(__file__, "block 4 is empty. (Missing grey fade-out block?)", 2)
                else:
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 4)
                if not curArtImg:
                    rss_print.print_debug(__file__, "no image found.", 0)
                else:
                    rss_print.print_debug(__file__, "curArtImg = " + curArtImg, 4)

                if curArtDesc1 and curArtDesc1 == curArtDesc2:
                    rss_print.print_debug(__file__, "blocks 1 and 2 are identical", 0)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                if curArtDesc2 and curArtDesc2 == curArtDesc3:
                    rss_print.print_debug(__file__, "blocks 2 and 3 are identical", 0)
                    rss_print.print_debug(__file__, "curArtDesc2 = " + curArtDesc2, 1)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                if curArtDesc3 and curArtDesc3 == curArtDesc4:
                    rss_print.print_debug(__file__, "blocks 3 and 4 are identical", 0)
                    rss_print.print_debug(__file__, "curArtDesc3 = " + curArtDesc3, 1)
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1)
                if curArtDesc4 and curArtDesc4 == curArtDesc1:
                    rss_print.print_debug(__file__, "blocks 4 and 1 are identical", 0)
                    rss_print.print_debug(__file__, "curArtDesc4 = " + curArtDesc4, 1)
                    rss_print.print_debug(__file__, "curArtDesc1 = " + curArtDesc1, 1)

            curArtDesc = curArtDesc1 + ' ' + curArtDesc2 + ' ' + curArtDesc3 + ' ' + curArtDesc4

            if "button--for-subscription" in curArtDesc:
                curArtDesc = curArtDesc.replace(' tellijatele', '')
                curArtDesc = curArtDesc.replace('<a href="https://minumeedia.postimees.ee/kampaania/" target="_blank" class="my-media-link">digipaketi</a>', '')
                curArtDesc = curArtDesc.replace('<div class="article-body__item article-body__item--audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<div class="audio-teaser">', '<div>')
                curArtDesc = curArtDesc.replace('<img data-lazy-src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" src="/v5/img/icons/diamond-black-on-yellow.svg" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<img src="/v5/img/icons/diamond-black-on-yellow.svg" alt="Tellijale" width="30" height="30">', "")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription">', "<span>")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__diamond diamond--ee">', "<span>")
                curArtDesc = curArtDesc.replace('<span class="button--for-subscription__text"', "")
                curArtDesc = curArtDesc.replace('Artikkel on kuulatav', '')
                curArtDesc = curArtDesc.replace('Tellijale', '')

            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(articleDataDict["descriptions"], i, curArtDesc)

            articleDataDict["images"] = parsers_common.list_add_or_assign(articleDataDict["images"], i, curArtImg)

    return articleDataDict
Example #15
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 1)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # set 0 for all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree, '//tbody/tr/th[@class="col-1"]/text()')
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//tbody/tr/th[@class="col-7 teemapealkiri"]/a/text()')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree, '//tbody/tr/th[@class="col-4"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Lõvide perekonna uus teema",
        "abort",
        "beebi",
        "ivf",
        "lapse",
        "rase ",
        "rased",
        "triibupüüdjad",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # go through the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page contents
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            curParentUrl = parsers_common.get(parentPages["urls"], i)
            curParentUrl = curParentUrl.split("/#")[0]
            pageTree = parsers_common.get_article_tree(
                domain,
                curParentUrl,
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//div[@class="bbp-reply-content entry-content"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="post_date date updated"]/text()')
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//div[@class="bbp-reply-header entry-title"]/@id')

            # go through the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # description
                curArtDesc = parsers_common.get(
                    articlePostsDict["descriptions"], j)
                curArtDesc = curArtDesc.split(
                    '<div class="gdrts-rating-block ')[0]
                curArtDesc = parsers_html.html_remove_single_parents(
                    curArtDesc)
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j, curArtDesc)

                # pubDates
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.guess_datetime(curArtPubDate)
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parsers_common.get(
                    parentPages["urls"],
                    i) + "/#" + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic " + str(i + 1) + " post no. " +
                    str(j + 1) + "/(" + str(len(articlePostsDict["urls"])) +
                    ") is " + articlePostsDict["urls"][j], 2)

    # remove unwanted content: descriptions
    dictList = [
        " liba "
        "Kommentaar eemaldatud.",
        "Liba?",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "descriptions")

    return articleDataDict
Example #16
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//main/div[@class="content"]/div/div/div[1]/div/div',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree,
        '//main/div[1]/div/div/div[1]/div/div/a/div/img/@src')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//main/div[@class="content"]/div/div/div[1]/div/div/div/div[1]/div[2]/a/text()'
    )
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//main/div[@class="content"]/div/div/div[1]/div/div/div/div[1]/div[2]/a/@href'
    )

    # remove unwanted content: titles
    dictList = [
        "(uus) raamat",
        "abramova",
        "akvavit",
        "based broccoli",
        "beats of no nation",
        "bisweed",
        "ekkm",
        "error!",
        "floorshow",
        "gnoom",
        "hard feeler",
        "hillbilly picnic",
        "ida jutud",
        "ida räpp",
        "intro",
        "katus",
        "keskkonnatund",
        "kink konk",
        "korrosioon",
        "kräpp",
        "let me juke",
        "liin ",
        "liin",
        "lunchbreak lunchdate",
        "meie igapäevane avalik ruum",
        "milk",
        "muster",
        "myös",
        "müürilehe hommik",
        "n-lib"
        "oleneb päevast!",
        "oujee!",
        "paneel",
        "playa music",
        "propel",
        "puhkus",
        "rets records",
        "room_202",
        "rõhk",
        "saal raadio",
        "soojad suhted",
        "svet nureka",
        "söökladisko",
        "triinemets.",
        "vitamiin k",
        "zubrovka am",
        "ära kaaguta!",
        "öömaja",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    # remove unwanted content: descriptions
    dictList = [
        "#hip hop",
        "#interview",
        "#rap",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "descriptions")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # title
        curArtTitle = parsers_common.get(articleDataDict["titles"], i)
        curArtTitle = parsers_common.str_title_at_domain(curArtTitle, domain)
        articleDataDict["titles"] = parsers_common.list_add_or_assign(
            articleDataDict["titles"], i, curArtTitle)

    return articleDataDict
Example #17
def fill_article_dict(articleDataDict, pageTree, domain):

    articleDataDict["descriptions"] = parsers_common.xpath_to(
        "list",
        pageTree,
        '//div[@class="list-item"]/div[@class="details"]',
        parent=True)
    articleDataDict["images"] = parsers_common.xpath_to(
        "list", pageTree, '//div[@class="list-item"]/a[@class="image"]/@style')
    articleDataDict["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="list-item"]/div[@class="details"]/a[1]/text()')
    articleDataDict["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//div[@class="list-item"]/div[@class="details"]/a[1]/@href')

    # remove unwanted content: titles
    dictList = [
        "edasi lükatud",
        "ei toimu",
        "jääb ära",
        "lükkub edasi",
        "tühistatud",
    ]
    articleDataDict = parsers_common.article_data_dict_clean(
        articleDataDict, dictList, "in", "titles")

    for i in parsers_common.article_urls_range(articleDataDict["urls"]):
        # image
        curArtImage = parsers_common.get(articleDataDict["images"], i)
        curArtImage = parsers_common.split_failsafe(curArtImage, "'", 1)
        articleDataDict["images"] = parsers_common.list_add_or_assign(
            articleDataDict["images"], i, curArtImage)

        # url
        curArtUrl = parsers_common.get(articleDataDict["urls"], i)
        curArtUrl = curArtUrl.split("?event")[0]
        articleDataDict["urls"] = parsers_common.list_add_or_assign(
            articleDataDict["urls"], i, curArtUrl)

        if parsers_common.should_get_article_body(i):
            # load article into tree
            pageTree = parsers_common.get_article_tree(domain,
                                                       curArtUrl,
                                                       cache='cacheAll')

            # description
            curArtDesc1 = parsers_common.get(articleDataDict["descriptions"],
                                             i)
            curArtDesc2 = parsers_common.xpath_to("single",
                                                  pageTree,
                                                  '//article',
                                                  parent=True)
            curArtDesc2 = curArtDesc2.replace(
                "<h5 class=\"sm-hide\">Galerii</h5>", "")
            curArtDesc2 = curArtDesc2.replace(
                "<h5 class=\"sm-hide\">Tutvustus</h5>", "")
            curArtDesc2 = curArtDesc2.replace(
                "<div class=\"link after-arrow_down sm-show\">Loe lähemalt</div>",
                "")
            curArtDesc = curArtDesc1 + curArtDesc2
            articleDataDict["descriptions"] = parsers_common.list_add_or_assign(
                articleDataDict["descriptions"], i, curArtDesc)

    # reverse the order into the expected direction
    articleDataDict = parsers_common.dict_reverse_order(articleDataDict)

    return articleDataDict
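
One non-obvious step above: the image URL is read from a CSS style attribute rather than an <img> tag, so split_failsafe(curArtImage, "'", 1) is presumably pulling the URL out from between the single quotes of background-image: url('…'). A small self-contained illustration with an assumed input:

# Sketch of why splitting the @style value on "'" yields the image URL.
# The input string shape is an assumption about the scraped markup.
style = "background-image: url('https://example.com/pic.jpg');"

# Guarded split mirroring the assumed split_failsafe behavior:
# return piece 1 if it exists, otherwise fall back to the input unchanged.
parts = style.split("'")
imageUrl = parts[1] if len(parts) > 1 else style
print(imageUrl)  # https://example.com/pic.jpg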
Ejemplo n.º 18
def fill_article_dict(articleDataDict, pageTree, domain):

    maxArticleBodies = min(rss_config.REQUEST_ARTICLE_BODIES_MAX, 5)
    maxArticlePosts = round(rss_config.REQUEST_ARTICLE_POSTS_MAX /
                            maxArticleBodies)  # 0 means fetch all posts

    parentPages = {}
    parentPages["stamps"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="meta"][4]/span/text()'
    )
    parentPages["titles"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@title')
    parentPages["urls"] = parsers_common.xpath_to(
        "list", pageTree,
        '//table[@class="grid zebra forum"]/tr/td[@class="title"]/a/@href')

    # remove unwanted content: titles
    dictList = [
        "Börsihai",
        "Cleveroni aktsiate ost/müük/oksjon",
        "Head uut aastat – prognoosid",
        "Keegi malet soovib mängida",
        "LHV Pank paremaks",
        "Uurimis- ja lõputööde küsimustikud",
    ]
    parentPages = parsers_common.article_data_dict_clean(
        parentPages, dictList, "in", "titles")

    # iterate over the topics
    for i in parsers_common.article_urls_range(parentPages["urls"]):
        # fetch the topic page content
        if parsers_common.should_get_article_body(i, maxArticleBodies):
            # load article into tree
            pageTree = parsers_common.get_article_tree(
                domain,
                parentPages["urls"][i] +
                '?listEventId=jumpToPage&listEventParam=100&pagesOfMaxSize=true',
                cache='cacheStamped',
                pageStamp=parentPages["stamps"][i])

            articlePostsDict = {}
            articlePostsDict["authors"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/p[@class="author"]/strong/a/text()'
            )
            articlePostsDict["descriptions"] = parsers_common.xpath_to(
                "list",
                pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-content temporary-class"]',
                parent=True)
            articlePostsDict["pubDates"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/node()'
            )
            articlePostsDict["urls"] = parsers_common.xpath_to(
                "list", pageTree,
                '//ul[@class="forum-topic"]/li/div[@class="col2"]/div[@class="forum-header clear"]/div/p[@class="permalink"]/a/@href'
            )

            # iterate over the topic's posts
            for j in parsers_common.article_posts_range(
                    articlePostsDict["urls"], maxArticlePosts):
                # author
                articleDataDict["authors"] = parsers_common.list_add(
                    articleDataDict["authors"], j,
                    parsers_common.get(articlePostsDict["authors"], j))

                # description
                articleDataDict["descriptions"] = parsers_common.list_add(
                    articleDataDict["descriptions"], j,
                    parsers_common.get(articlePostsDict["descriptions"], j))

                # pubDates: normalize strings like "Eile 23:49" or
                # "15.01.2012 23:49" into datetime()
                curArtPubDate = parsers_common.get(
                    articlePostsDict["pubDates"], j)
                curArtPubDate = parsers_datetime.replace_string_with_timeformat(
                    curArtPubDate, "Eile", "%d.%m.%Y", offsetDays=-1)
                curArtPubDate = parsers_datetime.add_missing_date_to_string(
                    curArtPubDate, "%d.%m.%Y %H:%M", "%d.%m.%Y ")
                curArtPubDate = parsers_datetime.raw_to_datetime(
                    curArtPubDate, "%d.%m.%Y %H:%M")
                articleDataDict["pubDates"] = parsers_common.list_add(
                    articleDataDict["pubDates"], j, curArtPubDate)

                # title
                curArtTitle = parsers_common.get(parentPages["titles"], i)
                curArtTitle = parsers_common.str_title_at_domain(
                    curArtTitle, domain)
                articleDataDict["titles"] = parsers_common.list_add(
                    articleDataDict["titles"], j, curArtTitle)

                # url
                curArtUrl = parentPages["urls"][i] + articlePostsDict["urls"][j]
                articleDataDict["urls"] = parsers_common.list_add(
                    articleDataDict["urls"], j, curArtUrl)

                rss_print.print_debug(
                    __file__, "topic post no. " + str(j + 1) + "/(" +
                    str(len(articlePostsDict["urls"])) + ") is " +
                    articlePostsDict["urls"][j], 2)

    return articleDataDict
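
The pubDate block above chains three parsers_datetime helpers to turn forum timestamps like "Eile 23:49" ("Yesterday 23:49") or "15.01.2012 23:49" into datetime objects. A stdlib-only sketch of the assumed transformations, not the helpers' actual implementation:

# Stdlib-only sketch of the assumed pubDate normalization above:
# "Eile 23:49"        -> yesterday's date + time
# "23:49"             -> today's date + time (missing date filled in)
# "15.01.2012 23:49"  -> parsed as-is
from datetime import datetime, timedelta

def normalize_pub_date(raw):
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d.%m.%Y")
    raw = raw.replace("Eile", yesterday)
    if " " not in raw:  # bare "HH:MM" -> prepend today's date
        raw = datetime.now().strftime("%d.%m.%Y ") + raw
    return datetime.strptime(raw, "%d.%m.%Y %H:%M")

print(normalize_pub_date("15.01.2012 23:49"))  # 2012-01-15 23:49:00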