Ejemplo n.º 1
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles for this site.

    Parameters:
        pageTree: parsed lxml tree of the listing page.
        domain: site domain, prepended to relative article/image URLs.
        maxPageURLstoVisit: max number of article pages to fetch bodies from.

    Returns a dict of parallel lists: articleDescriptions, articleIds,
    articleImages, articlePubDates, articleTitles, articleUrls.
    """

    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/text()')
    articleUrls = pageTree.xpath('//div[@class="forum"][2]/ul/li/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    get_article_bodies = True

    for i, articleUrl in enumerate(articleUrls):
        # unique id: the value of the "&id=" query parameter.
        # Fix: the original required a terminating '&' after the id value and
        # raised ValueError when "id" was the last parameter in the URL; fall
        # back to end-of-string in that case.
        idStart = articleUrl.index('&id=') + 4
        idEnd = articleUrl.find('&', idStart)
        if idEnd == -1:
            idEnd = len(articleUrl)
        articleIds.append(articleUrl[idStart:idEnd])

        if get_article_bodies is True and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions: full article body
            articleDescriptions.append(extractArticleBody(articleTree))

            # images
            curArtPubImage = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/a/img[@class="thumb"]/@src'
            )
            articleImages.append(curArtPubImage)

            # timeformat magic from "13/12/2017 22:24:59" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree,
                '//div[@id="content"]/div[@class="full_width"]/p[*]/i/b[2]/text()'
            )
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d/%m/%Y %H:%M:%S")
            articlePubDates.append(curArtPubDate)

    articleImages = parsers_common.domainUrls(domain, articleImages)

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 2
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the list of Tartu treasure entries.

    Rows are read from the listing table, enriched with ids, serialized
    descriptions and parsed dates, then filtered down to entries whose
    description mentions "Tartu". Returns a dict of parallel lists.
    """

    articlePubDates = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[1]/text()')
    articleTitles = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr/td[@class="left"]/b/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    # whole table rows serve as the description source (as a parent)
    descriptionRows = pageTree.xpath(
        '//div[@id="t-content"]/table[1]/tr')

    articleDescriptions = []
    articleIds = []

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: last path segment of the article url
        articleIds.append(articleUrl.split('/')[-1])

        # description: serialized children of the matching table row
        articleDescriptions.append(
            parsers_common.stringify_children(descriptionRows[idx]))

        # timeformat magic from "12.12.2017" to datetime()
        articlePubDates[idx] = parsers_common.rawToDatetime(
            articlePubDates[idx], "%d.%m.%Y")

    # keep only entries whose description mentions the "Tartu" location
    keep = [idx for idx in range(len(articleUrls))
            if 'Tartu' in articleDescriptions[idx]]

    return {
        "articleDescriptions": [articleDescriptions[idx] for idx in keep],
        "articleIds": [articleIds[idx] for idx in keep],
        "articleImages": [],
        "articlePubDates": [articlePubDates[idx] for idx in keep],
        "articleTitles": [articleTitles[idx] for idx in keep],
        "articleUrls": [articleUrls[idx] for idx in keep],
    }
Ejemplo n.º 3
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles from the search-results listing.

    Up to maxPageURLstoVisit article pages are fetched to obtain full
    descriptions, images and publication times. Returns a dict of
    parallel lists.
    """

    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath('//ul[@class="list search-items-list"]/li/span/a/text()')
    articleUrls = pageTree.xpath('//ul[@class="list search-items-list"]/li/span/a/@href')

    articlePubDatesRaw = pageTree.xpath('//ul[@class="list search-items-list"]/li/div/span[2]/text()')

    fetchArticleBodies = True

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: second-to-last path segment of the article url
        articleIds.append(articleUrl.split('/')[-2])

        if not (fetchArticleBodies and idx < maxPageURLstoVisit):
            continue

        # fetch and parse the article page itself
        articleTree = makereq.getArticleData(articleUrl)

        # description: serialized children of both possible content sections
        sectionOne = parsers_common.stringify_children(parsers_common.treeExtract(
            articleTree, '//main/article/div[@class="flex flex--align-items-stretch"]//section'))  # as a parent
        sectionTwo = parsers_common.stringify_children(parsers_common.treeExtract(
            articleTree, '//main/div[@class="wrap"]/div[@class="flex flex--align-items-stretch"]//section'))  # as a parent
        articleDescriptions.append(sectionOne + ' ' + sectionTwo)

        # image: first figure image, protocol-relative fallback when absent
        imgSrc = parsers_common.treeExtract(articleTree, '//main/article/div[@class="flex flex--align-items-stretch"]//figure/img[1]/@src') or "//"
        articleImages.append("http:" + imgSrc)

        # timeformat magic from "24. detsember 2017 17:51" to datetime()
        rawDate = parsers_common.longMonthsToNumber(articlePubDatesRaw[idx])
        articlePubDates.append(parsers_common.rawToDatetime(rawDate, "%d. %m %Y %H:%M"))

    return {"articleDescriptions": articleDescriptions,
            "articleIds": articleIds,
            "articleImages": articleImages,
            "articlePubDates": articlePubDates,
            "articleTitles": articleTitles,
            "articleUrls": articleUrls,
           }
Ejemplo n.º 4
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles for this site.

    Everything comes straight from the listing page; visiting individual
    article pages for timestamps is currently disabled (broken).
    Returns a dict of parallel lists.
    """

    articleDescriptions = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/p[1]/text()'
    )
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/a/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/text()'
    )
    articleUrls = parsers_common.domainUrls(domain, pageTree.xpath(
        '//div[@id="nsp-nsp-234"]//div[@class="nspArt nspCol1"]/div[@class="gkArtContentWrap"]/h4/a/@href'
    ))

    # todo(reading times from articles is BROKEN and maybe useless too)
    visitArticlePages = False

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: hash derived from the article url
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if visitArticlePages is True and idx < maxPageURLstoVisit:
            # fetch and parse the article page
            articleTree = makereq.getArticleData(articleUrl)

            # timeformat magic from "Avaldatud: Neljapäev, 14 Detsember 2017 12:46" to datetime()
            rawDate = parsers_common.treeExtract(
                articleTree, '//span[@class="kakk-postdateicon"]//text()')
            rawDate = parsers_common.longMonthsToNumber(rawDate.split(',')[1])
            articlePubDates.append(
                parsers_common.rawToDatetime(rawDate, "%d %m %Y %H:%M"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 5
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles from the post-item listing.

    Visiting individual article pages for descriptions and dates is
    currently disabled. Returns a dict of parallel lists.
    """

    articleDescriptions = []
    articleIds = []
    articleImages = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/div/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/h3/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="col-sm-6"]/div[@class="post-item"]/a/@href')

    visitArticlePages = False

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: hash derived from the article url
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if visitArticlePages is True and idx < maxPageURLstoVisit:
            # fetch and parse the article page
            articleTree = makereq.getArticleData(articleUrl)

            # first bold paragraph serves as the description/header
            articleDescriptions.append(parsers_common.treeExtract(
                articleTree, '//div[@class="col-sm-9"]/p[1]/strong/text()'))

            # timeformat magic from "Avaldatud: 14 detsember, 2017" to datetime()
            rawDate = parsers_common.treeExtract(
                articleTree,
                '//div[@class="col-sm-9"]/div[@class="page-header"]/em/text()')
            rawDate = parsers_common.longMonthsToNumber(rawDate.split(':')[1])
            articlePubDates.append(
                parsers_common.rawToDatetime(rawDate, "%d %m, %Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 6
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the list of offers from this site's footable listing.

    Names, locations and dates come from fixed table columns; no article
    pages are visited. Returns a dict of parallel lists.
    """

    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/a/strong/text()')
    articleUrls = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/a/@href')

    descNames = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[1]/text()[2]')
    descLocations = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[4]/text()')
    rawPubDates = pageTree.xpath(
        '//table[@class="footable"]/tbody/tr/td[2]/text()')

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: trailing dash-separated token of the url
        articleIds.append(articleUrl.split('-')[-1])

        # description: plain-text name and location joined with a line break
        articleDescriptions.append(
            parsers_common.toPlaintext(descNames[idx]) + "<br>" +
            parsers_common.toPlaintext(descLocations[idx]))

        # timeformat magic from "12.12.2017" to datetime()
        articlePubDates.append(
            parsers_common.rawToDatetime(rawPubDates[idx], "%d.%m.%Y"))

        # normalize title casing in place
        articleTitles[idx] = parsers_common.toPlaintext(
            articleTitles[idx]).capitalize()

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 7
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of audio items from the audiolist listing.

    Descriptions are serialized from each item's description container;
    dates are parsed from the item label. Returns a dict of parallel lists.
    """

    articleDescriptions = []
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/div[@class="audiolist_item_label"]/text()'
    )
    articleUrls = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/a/@href'
    )

    descriptionParents = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_bottom"]/div[@class="audioitem_item_desc"]'
    )  # as a parent
    rawPubDates = pageTree.xpath(
        '//div[@class="audiolist_item"]/div[@class="audiolist_item_header"]/div[@class="audiolist_item_label"]/text()'
    )

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: hash derived from the article url
        articleIds.append(parsers_common.urlToHash(articleUrl))

        # description: serialized children of the description container
        articleDescriptions.append(
            parsers_common.stringify_children(descriptionParents[idx]))

        # timeformat magic from "15.12.2017 - L" to datetime()
        rawDate = rawPubDates[idx].split('-')[0]
        rawDate = parsers_common.shortMonthsToNumber(rawDate)
        articlePubDates.append(
            parsers_common.rawToDatetime(rawDate, "%d.%m.%Y"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 8
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles from the newsline listing.

    Descriptions are the link texts prefixed with the item tag; no article
    pages are visited. Returns a dict of parallel lists.
    """

    articleDescriptions = pageTree.xpath('//div[@class="js-newsline-container"]/div/a/text()')
    articleIds = []
    articleImages = []
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="js-newsline-container"]/div/a/text()')
    articleUrls = pageTree.xpath('//div[@class="js-newsline-container"]/div/a/@href')

    rawPubDates = pageTree.xpath('//div[@class="js-newsline-container"]/span[1]/text()')
    descriptionTags = pageTree.xpath('//div[@class="js-newsline-container"]/div/span[1]/text()')

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: second-to-last path segment of the article url
        articleIds.append(articleUrl.split('/')[-2])

        # prepend the tag line to the description, separated by a line break
        articleDescriptions[idx] = descriptionTags[idx] + "<br>" + articleDescriptions[idx]

        # timeformat magic from "14 dets  2017 11:34" to datetime()
        rawDate = parsers_common.shortMonthsToNumber(rawPubDates[idx])
        articlePubDates.append(parsers_common.rawToDatetime(rawDate, "%d %m %Y %H:%M"))

    return {"articleDescriptions": articleDescriptions,
            "articleIds": articleIds,
            "articleImages": articleImages,
            "articlePubDates": articlePubDates,
            "articleTitles": articleTitles,
            "articleUrls": articleUrls,
           }
Ejemplo n.º 9
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles from the midColPost listing.

    Everything comes straight from the listing page; no article pages are
    visited. Returns a dict of parallel lists.
    """

    articleDescriptions = pageTree.xpath('//div[@class="midColPost"]/p/text()')
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="midColPost"]/a/img/@src')
    articlePubDates = []
    articleTitles = pageTree.xpath('//div[@class="midColPost"]/h2/a/@title')
    articleUrls = pageTree.xpath('//div[@class="midColPost"]/h2/a/@href')

    rawPubDates = pageTree.xpath(
        '//div[@class="midColPost"]/span/text()[1]')

    for idx, articleUrl in enumerate(articleUrls):
        # unique id: the "?p=" query value from the article url
        articleIds.append(articleUrl.split("?p=")[1])

        # timeformat magic from "15. detsember 2017 / " to datetime()
        rawDate = parsers_common.longMonthsToNumber(rawPubDates[idx])
        articlePubDates.append(
            parsers_common.rawToDatetime(rawDate, "%d. %m %Y /"))

    return {
        "articleDescriptions": articleDescriptions,
        "articleIds": articleIds,
        "articleImages": articleImages,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }
Ejemplo n.º 10
0
def getArticleListsFromHtml(pageTree, domain, maxPageURLstoVisit):
    """
    Build the full list of news articles from the news-list listing.

    Parameters:
        pageTree: parsed lxml tree of the listing page.
        domain: site domain, prepended to relative article/image URLs.
        maxPageURLstoVisit: max number of article pages to fetch bodies from.

    Entries beyond maxPageURLstoVisit fall back to the listing page's
    day/month/year fragments for the publication date. Returns a dict of
    parallel lists.
    """

    articleDescriptions = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-excerpt"]/p/text()'
    )
    articleIds = []
    articleImages = pageTree.xpath('//div[@class="news-list-media"]/img/@src')
    articleImages = parsers_common.domainUrls(domain, articleImages)
    articlePubDates = []
    articleTitles = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/text()')
    articleUrls = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/h3/a/@href')
    articleUrls = parsers_common.domainUrls(domain, articleUrls)

    articlePubDay = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[1]'
    )
    articlePubMonth = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/span[@class="month"]/text()'
    )
    articlePubYear = pageTree.xpath(
        '//div[@class="news-list-item-wrapper"]/div[@class="news-list-item-date"]/text()[2]'
    )

    get_article_bodies = True

    # Fix: curYear was referenced before assignment (NameError) when the first
    # fallback-branch row had no usable year text; seed it from the first
    # available year fragment so the fallback always has a value.
    curYear = articlePubYear[0].strip() if articlePubYear else ""

    for i, articleUrl in enumerate(articleUrls):
        # generate unique id from articleUrl
        articleIds.append(parsers_common.urlToHash(articleUrl))

        if get_article_bodies is True and i < maxPageURLstoVisit:
            # load article into tree
            articleTree = makereq.getArticleData(articleUrl)

            # descriptions: serialized children of the article content block
            curArtDescParent = parsers_common.treeExtract(
                articleTree,
                '//div[@class="news-single-item"]/div[@class="news-single-content"]'
            )  # as a parent
            curArtDescChilds = parsers_common.stringify_children(
                curArtDescParent)
            articleDescriptions[i] = curArtDescChilds

            # timeformat magic from "13 dets  17" to datetime()
            curArtPubDate = parsers_common.treeExtract(
                articleTree, '//div[@class="news-single-timedata"]/text()')
            curArtPubDate = parsers_common.shortMonthsToNumber(curArtPubDate)
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d %m %y")
            articlePubDates.append(curArtPubDate)
        else:
            # carry the last plausible (> 2016) year forward; listing rows may
            # omit or garble the year fragment. isdigit() guards the int()
            # conversion against empty/non-numeric text (previously ValueError).
            if i < len(articlePubYear):
                yearText = articlePubYear[i].strip()
                if yearText.isdigit() and int(yearText) > 2016:
                    curYear = yearText
            curArtPubDate = (articlePubDay[i].strip() + " " +
                             articlePubMonth[i].strip() + " " + curYear)
            curArtPubDate = parsers_common.rawToDatetime(
                curArtPubDate, "%d %m %Y")
            articlePubDates.append(curArtPubDate)

    return {
        "articleDescriptions": articleDescriptions,
        "articleImages": articleImages,
        "articleIds": articleIds,
        "articlePubDates": articlePubDates,
        "articleTitles": articleTitles,
        "articleUrls": articleUrls,
    }