Example #1
def update():
    paper = "jerusalempost"
    feeds = (
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333468"
    )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Change links to printable
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.replace("ShowFull", "Printer")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
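
Every example in this listing drives the same MainUpdate helper from the aur project. The helper's real code is not shown here, so the skeleton below is only a hypothetical sketch of the interface these snippets appear to assume: the attribute and method names are inferred from the call sites, while the bodies and the name of parse()'s optional third argument are placeholders.

class MainUpdate:
    """Hypothetical skeleton of the shared helper, inferred from usage only."""

    def __init__(self):
        self.links = []          # article links found while parsing the feeds
        self.titles = []         # titles matching self.links
        self.outfiles = []       # downloaded article bodies (strings)
        self.outtitles = []      # titles matching self.outfiles
        self.scrapefiles = []    # pages fetched by scrape() for link hunting
        self.scrapetitles = []   # titles matching self.scrapefiles

    def parse(self, paper, feeds, linkfield="link"):
        #Read each RSS feed and fill self.links / self.titles with articles
        #not yet seen for this paper; "linkfield" is an assumed name for the
        #optional third argument used in the washingtonpost example.
        raise NotImplementedError

    def scrape(self, paper, urls, titles):
        #Fetch the pages so callers can hunt for printable-version links.
        raise NotImplementedError

    def download(self, paper, urls, titles):
        #Download the articles into self.outfiles / self.outtitles.
        raise NotImplementedError

    def insert(self, paper, files, titles):
        #Store the downloaded articles in the database.
        raise NotImplementedError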
Example #2
File: foxnews.py Project: lonicera/aur
def update():
    paper = "foxnews"
    feeds = ("http://feeds.foxnews.com/foxnews/world?format=xml",
             "http://feeds.foxnews.com/foxnews/politics?format=xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Modify links for printable version
    beginurl = "http://www.foxnews.com/printer_friendly_story/"
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link.split('/')[-1].replace("2933", "3566")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Make sure failures are taken out
    actualinsertfiles = []
    actualtitles = []
    for num, outfile in enumerate(updatepaper.outfiles):
        if not "404 Not Found" in outfile and not "Page cannot be found" in outfile:
            actualinsertfiles.append(outfile)
            actualtitles.append(updatepaper.outtitles[num])

    #Insert the modified links into the DB
    updatepaper.insert(paper, actualinsertfiles, actualtitles)
Example #3
def update():
    paper = "economist"
    feeds = ("http://www.economist.com/rss/briefings_rss.xml",
             "http://www.economist.com/rss/europe_rss.xml",
             "http://www.economist.com/rss/united_states_rss.xml",
             "http://www.economist.com/rss/the_americas_rss.xml",
             "http://www.economist.com/rss/middle_east_and_africa_rss.xml",
             "http://www.economist.com/rss/asia_rss.xml",
             "http://www.economist.com/rss/international_rss.xml",
             "http://www.economist.com/rss/finance_and_economics_rss.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Modify links for printable version
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.replace("displaystory", "PrinterFriendly")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #4
File: allafrica.py Project: lonicera/aur
def update():
    paper = "allafrica"
    feeds = ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Do allafrica-specific processing on the links
    beginurl = "http://allafrica.com/stories/printable/"
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split("/")
        actualurl = beginurl + splitlink[-1]
        actualurls.append(actualurl)

    #Download the modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #5
File: bostonglobe.py Project: lonicera/aur
def update():
    paper = "bostonglobe"
    feeds = ("http://syndication.boston.com/news/nation?mode=rss_10",
             "http://syndication.boston.com/news/politics/?mode=rss_10",
             "http://syndication.boston.com/business/?mode=rss_10")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Do bostonglobe-specific processing on the links
    endurl = "?mode=PF"
    actualurls = []
    for link in updatepaper.links:
        actualurls.append(link.split('?')[0] + endurl)

    #Download the modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(
            file.replace("document.location.replace(csplit);", ""))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #6
def update():
    paper = "spiegel"
    feeds = ("http://www.spiegel.de/schlagzeilen/rss/0,5291,676,00.xml", )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split(",")
        actualurl = splitlink[0] + "," + splitlink[1] + ",druck-" + splitlink[
            2] + "," + splitlink[3]
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(file.replace('window.print()', ""))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #7
def update():
    paper = "sydneymorningherald"
    feeds = ("http://feeds.smh.com.au/rssheadlines/top.xml",
             "http://feeds.smh.com.au/rssheadlines/national.xml",
             "http://feeds.smh.com.au/rssheadlines/world.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable
    beginurl = "http://www.smh.com.au/cgi-bin/common/popupPrintArticle.pl?path=/articles/"
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split("/")
        actualurl = (beginurl + splitlink[-4] + "/" + splitlink[-3] + "/" +
                     splitlink[-2] + "/" + splitlink[-1])
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #8
def update():
    paper = "washingtonpost"
    feeds = (
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/administration/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/congress/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/elections/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/world/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/business/economy/index_xml"
    )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds, "id")

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split(".")
        if splitlink[3:]:
            actualurl = splitlink[0] + "." + splitlink[1] + "." + splitlink[
                2] + "_pf." + splitlink[3]
            actualurls.append(actualurl)

    #Download modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #9
def update():
    paper = "bbc"
    feeds = (
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/middle_east/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml"
    )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Do bbc-specific processing on the links, skipping blacklisted sections
    blacklist = ("sport", "default", "thread", "blogs", "picture_gallery",
                 "pop_ups")
    beginurl = "http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk"
    actualurls = []
    actualtitles = []
    for link, title in zip(updatepaper.links, updatepaper.titles):
        if any(term in link for term in blacklist):
            continue

        #Rebuild the path after the "-" marker onto the print-friendly host
        splitlink = link.split("/")
        beginnum = splitlink.index("-") + 1
        actualurl = beginurl
        for urlchunk in splitlink[beginnum:]:
            actualurl = actualurl + "/" + urlchunk
        actualurls.append(actualurl)
        actualtitles.append(title)

    #Check to see if any urls are left after filtering
    if len(actualurls) == 0:
        print("No new articles found.")
        return 0

    #Download the modified links
    updatepaper.download(paper, actualurls, actualtitles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #10
File: latimes.py Project: lonicera/aur
def update():
    paper = "latimes"
    feeds = (
        "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml",
        "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Scrape the articles for their printable link
    scrapeurls = []
    beginurl = "http://www.latimes.com/news"
    for url in updatepaper.links:
        urlparts = url.split("/")
        scrapeurl = (beginurl + "/" + urlparts[6] + "/" + urlparts[7] + "/" +
                     urlparts[-1])
        scrapeurls.append(scrapeurl)

    updatepaper.scrape(paper, scrapeurls, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    #Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.latimes.com"
    total = len(updatepaper.scrapefiles)
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if '>Print<' in line:
                actualurls.append(beginurl + line.split('"')[1])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    #Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #11
def update():
    paper = "financialtimes"
    feeds = ("http://www.ft.com/rss/world", "http://www.ft.com/rss/companies",
             "http://www.ft.com/rss/home/uk", "http://www.ft.com/rss/home/us",
             "http://www.ft.com/rss/home/europe",
             "http://www.ft.com/rss/home/asia", "http://www.ft.com/rss/lex")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Modify links for printable version
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.replace("0/", "").replace(".html", ",print=yes.html")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Format outputted files
    insertfiles = []
    for file in updatepaper.outfiles:
        readfile = file.split("\n")
        insertfile = "<b>Financial Times</b>"

        for line in readfile:
            if "ft-story-header" in line or "ft-story-body" in line or "charset" in line:
                insertfile = insertfile + "\n" + line

        insertfile = (insertfile.replace('lang=     "en">', '')
                      .replace('}// ]]></script>', '')
                      .replace('<script type="text/javascript">', ''))
        insertfiles.append(insertfile)

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #12
def update():
    paper = "dallasmorningnews"
    feeds = (
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewswash.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsworld.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Download the links
    updatepaper.download(paper, updatepaper.links, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Format outputted files
    insertfiles = []
    for file in updatepaper.outfiles:
        readfile = file.split("\n")
        insertfile = "<b>Dallas Morning News</b>"

        dowrite = 0
        for line in readfile:
            if dowrite == 1:
                if "<!-- vstory end -->" in line:
                    break
                else:
                    insertfile = insertfile + "\n" + line
            elif "<!-- vstory begin -->" in line:
                dowrite = 1

        insertfiles.append(insertfile)

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #13
def update():
    paper = "chicagotribune"
    feeds = (
        "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",
        "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Scrape the articles for their printable link
    updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    #Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.chicagotribune.com"
    total = len(updatepaper.scrapefiles)
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if 'alt="Print"' in line:
                actualurls.append(beginurl + line.split('"')[5])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    #Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #14
File: haaretz.py Project: lonicera/aur
def update():
    paper = "haaretz"
    feeds = ("http://www.haaretz.com/feed/enewsRss.xml",
             "http://www.haaretz.com/feed/edefenseRss.xml",
             "http://www.haaretz.com/feed/enationalRss.xml",
             "http://www.haaretz.com/feed/ejewishworldRss.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    beginurl = "http://www.haaretz.com/hasen/objects/pages/PrintArticleEn.jhtml?itemNo="
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link.split("/")[-1].replace(".html", "")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(
            file.replace('<body bgcolor="" onload="print();">',
                         "").replace('charset="windows-1255"', ''))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)  # 1 - nothing was downloaded, 2 - keyboard interrupt
Example #15
def update():
    paper = "seattlepi"
    feeds = ("http://seattlepi.nwsource.com/rss/apafrica.rss",
             "http://seattlepi.nwsource.com/rss/apaa.rss",
             "http://seattlepi.nwsource.com/rss/apasia.rss",
             "http://seattlepi.nwsource.com/rss/apelection.rss",
             "http://seattlepi.nwsource.com/rss/apmideast.rss",
             "http://seattlepi.nwsource.com/rss/apwash.rss")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable version
    beginurl = "http://seattlepi.nwsource.com/printer2/index.asp?ploc=t&refer="
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(file.replace('window.print()', ""))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #16
File: japantimes.py Project: lonicera/aur
def update():
    paper = "japantimes"
    feeds = ("http://feeds.feedburner.com/japantimes_news", )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Change links to printable
    beginurl = "http://search.japantimes.co.jp/print/"
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link.split("/")[-1]
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #17
File: miamiherald.py Project: lonicera/aur
def update():
    paper = "miamiherald"
    feeds = ("http://www.miamiherald.com/884/index.xml",
             "http://www.miamiherald.com/509/index.xml",
             "http://www.miamiherald.com/578/index.xml",
             "http://www.miamiherald.com/579/index.xml",
             "http://www.miamiherald.com/581/index.xml",
             "http://www.miamiherald.com/582/index.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #change to printable
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.replace("/story/", "/v-print/story/")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Weed out the incomplete articles
    insertfiles = []
    inserttitles = []
    for num, file in enumerate(updatepaper.outfiles):
        if not "Click here for full story" in file:
            insertfiles.append(file)
            inserttitles.append(updatepaper.outtitles[num])

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, inserttitles)
Example #18
File: londontimes.py Project: lonicera/aur
def update():
    paper = "londontimes"
    feeds = ("http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml", )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Change links to printable
    dlext = "?print=yes"
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.split("#")[0] + dlext
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
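
Since every module in this listing exposes the same update() entry point, a driver could in principle run them all in one pass. The loop below is only an illustrative assumption: the module names come from the file labels above, but the import mechanism and error handling are not taken from the project.

import importlib

#Hypothetical driver loop; the project may wire its modules together differently.
PAPERS = ("foxnews", "allafrica", "bostonglobe", "latimes", "haaretz",
          "japantimes", "miamiherald", "londontimes")

def update_all():
    for name in PAPERS:
        try:
            importlib.import_module(name).update()
        except Exception as exc:
            #Keep going if a single paper's update fails.
            print("update failed for %s: %s" % (name, exc))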