def postNass():
    # get the nass's data from source
    nassUrl = 'http://www.nassauweekly.com/'
    nassSoup = sb.getSoup(nassUrl)
    nassAboutUrl = 'http://www.nassauweekly.com/about/'
    nassAboutSoup = sb.getSoup(nassAboutUrl)

    # the logo
    #elements = nassSoup.select(".logo img")
    #el = sb.listCatchItem(elements)
    #logo = el["src"]
    logo = 'https://walkercarpenter.files.wordpress.com/2016/02/nass-circle.png?w=800'

    # about
    elements = nassAboutSoup.select(".post-content p")
    about = ""
    s = " \n "
    # keep only the first two paragraphs; the database would not accept long strings
    for p in elements[:2]:
        about = about + s + p.text
    print "PRINTING ABOUT LEN: "
    print len(about)
    nass = Publication(name="The Nassau Weekly", logo=logo, description=about)
    id = nass.ppost()
    print "nass id: " + str(id)
    nass.addId(id)
    return nass
Example #2
def jsonify_page(urls, topicId, switch="JSON"):
    outlist = list()

    for url in urls:
        soup = sb.getSoup(url)

        title = getTitle(soup)
        author = getAuthor(soup)
        date = getDate(soup)
        imageUrls = getImages(soup)
        body = getBody(soup)
        # now convert to json dict, publication should correspond to nass, topic should be misc
        bornAgain = {
            'title': title,
            'author': author,
            'date': date,
            'body': body,
            'images': imageUrls,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            "posted": False,
            "id": 0
        }
        outlist.append(bornAgain)

    if switch == "JSON":
        return json.dumps(outlist, sort_keys=True, indent=4)
    else:
        return outlist
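A minimal usage sketch of the two return modes, assuming the module-level pieces this function relies on (sb, the get* helpers, and publicationId) are in scope; the article URL and topic id below are placeholders:

articleUrls = ['http://www.nassauweekly.com/some-article/']  # placeholder URL

# the default switch returns a pretty-printed JSON string
payload = jsonify_page(articleUrls, topicId=1)
records = json.loads(payload)

# any other switch value returns the raw list of dicts
recordList = jsonify_page(articleUrls, topicId=1, switch="LIST")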
Example #3
def postTigerMag():
    # get the data from source
    aboutURL = 'http://www.tigermag.com/about-us/'
    aboutSoup = sb.getSoup(aboutURL)

    # the logo
    logo = 'https://upload.wikimedia.org/wikipedia/en/1/15/The_Princeton_Tiger_Logo.png'

    # about
    elements = aboutSoup.select(".hentry-content p")
    about = ""
    s = " "
    # build the description from the about paragraphs
    for p in elements:
        about = about + s + p.text

    tigerMag = Publication(name="The Princeton Tiger",
                           logo=logo,
                           description=about)

    #mId = 22
    mId = tigerMag.ppost()
    print "tiger mag id: " + str(mId)
    tigerMag.addId(mId)
    return tigerMag
Example #4
def testUrl(url):
    # only download the page once
    soup = sb.getSoup(url)

    # get the article title, time, author
    title = getTitle(soup)
    sys.stdout.write("Title:\t\t")
    sys.stdout.write(title[0].text)
    writeN()

    author = getAuthor(soup)
    sys.stdout.write("Author:\t\t")
    sys.stdout.write(author[0].text)

    date = getDate(soup)
    sys.stdout.write("\t\tDate:\t\t")
    sys.stdout.write(date)
    writeN()

    # get the body text of our soup
    body = grabPageText(soup)
    # print out the article body
    for p in body:
        sys.stdout.write(p.text)
        writeN()
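Exercising this smoke test only takes a single article URL; the address below is a placeholder:

testUrl('http://www.nassauweekly.com/some-article/')  # placeholder article URL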
Example #5
def jsonify_page(urls, topicId, switch="JSON"):
    outlist = list()
    for url in urls:
        # download the page
        soup = sb.getSoup(url)

        # get the page content
        title = titleFormat(sb.listCatch(getTitle(soup)))
        author = sb.listCatch(getAuthor(soup))
        date = getDate(soup)
        # get the image urls
        imageUrls = getImURLS(soup)
        # body comes in list of paragraphs
        body = grabPageText(soup)
        body = getBodyAsString(body)
        if len(body) == 0:
            body = "/empty"
        # now convert to json dict
        bornAgain = {
            'title': title,
            'author': author,
            'date': date,
            'body': body,
            'images': imageUrls,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            'posted': False,
            'id': 0
        }
        outlist.append(bornAgain)

    if switch == "JSON":
        return json.dumps(outlist, sort_keys=True, indent=4)
    else:
        return outlist
Example #6
def getTopicPageUrls(topicPage):
    soup = sb.getSoup(topicPage)
    elements = soup.select(".post a")

    outSet = set()
    for el in elements:
        outSet.add(el["href"])

    return list(outSet)
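The deduplicated link list feeds straight into the jsonify_page helpers above; a minimal sketch, where the category URL and topic id are placeholders:

topicPage = 'http://www.nassauweekly.com/category/news/'  # placeholder category URL
urls = getTopicPageUrls(topicPage)
payload = jsonify_page(urls, topicId=2)  # placeholder topic id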
Example #7
def postPrince():
    # get the daily prince info from source
    princeUrl = 'http://www.dailyprincetonian.com/'
    princeSoup = sb.getSoup(princeUrl)
    princeAboutUrl = 'http://www.dailyprincetonian.com/page/about'
    princeAboutSoup = sb.getSoup(princeAboutUrl)

    # for now we are using the old logo because the new one looks nasty
    prince = Publication(
        name="The Daily Princetonian",
        #logo=sb.listCatchItem(princeSoup.select(".col-md-8 a img"))["src"],
        logo='http://dirgyzwl2hnqq.cloudfront.net/20170330XJxw8OoJDm/dist/img/favicons/apple-touch-icon.png',
        description=sb.listCatchItem(
            princeAboutSoup.select(".col-sm-12 p")).text)
    id = prince.ppost()
    print "prince id: " + str(id)
    prince.addId(id)
    return prince
Example #8
def getArchiveIssueLinks(archiveUrl="http://www.nassauweekly.com/issue/"):
    soup = sb.getSoup(archiveUrl)
    elements = soup.select("div h2 a")
    issueUrls = list()
    for el in elements:
        issueUrls.append(el["href"])
    # grab the dates as well!
    elements = soup.select(".post-date")
    dates = list()
    for el in elements:
        dates.append(sb.parseDate(el.text).split(" ")[0])
    return [issueUrls, dates]
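Since the function returns the issue URLs and their dates as two parallel lists, zip is a convenient way to walk them together; a minimal sketch:

issueUrls, dates = getArchiveIssueLinks()
for issueUrl, date in zip(issueUrls, dates):
    print date + "\t" + issueUrl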
Example #9
def getIssueArticleUrls(issueUrl):
    soup = sb.getSoup(issueUrl)
    elements = soup.select(".issue-posts span a")
    urls = list()
    # article links alternate with author links,
    # so keep only hrefs that are not /byline/ URLs
    for el in elements:
        url = el["href"]
        # exclude the author links
        found = re.search("/byline/", url)
        if found is None:
            urls.append(url)
    return urls
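Chaining the two archive helpers collects every article URL from every back issue; a sketch assuming both functions above are in scope:

issueUrls, dates = getArchiveIssueLinks()
allArticleUrls = list()
for issueUrl in issueUrls:
    allArticleUrls.extend(getIssueArticleUrls(issueUrl))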
Example #10
def getArticleURLS(params, topicTag):
    qURL = getPrinceQURL(params[0], params[1], params[2], topicTag)
    soup = sb.getSoup(qURL)
    links = soup.select(".clearfix a")
    urls = list()
    baseURL = "http://www.dailyprincetonian.com"
    # links are repeated, so we only select even indexes
    for i in range(0, len(links), 2):
        urls.append(links[i]['href'])

    for i in range(0, len(urls)):
        urls[i] = baseURL + urls[i]

    return urls
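Because the search results repeat every link, the loop above keeps only the even indexes; the same de-duplication can also be written with a slice, as in this small sketch with illustrative hrefs:

links = [{'href': '/article/a'}, {'href': '/article/a'},
         {'href': '/article/b'}, {'href': '/article/b'}]
baseURL = "http://www.dailyprincetonian.com"
# take every other link, then prefix with the site root
urls = [baseURL + link['href'] for link in links[::2]]
# urls -> ['http://www.dailyprincetonian.com/article/a',
#          'http://www.dailyprincetonian.com/article/b']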