Example #1
0
def scrapeArticle(URL):
    """Scrape a CBS News article into a dictionary.

    Returns a dict with title, author, body_text, URL, date, timestamp,
    source and (when present) image keys, or None when the page yields
    at most one usable paragraph.
    """
    soup = BeautifulSoup(urllib.urlopen(URL))
    mainStory = BeautifulSoup(str(soup.find("div", attrs={"class": "entry"})))
    sList = []
    for paragraph in mainStory.find_all("p"):
        # Keep only plain, non-empty paragraphs; decorated <p> tags
        # (captions etc.) carry a class attribute.
        if not paragraph.has_attr("class") and paragraph.get_text().strip():
            # Collapse internal whitespace and rebrand the network name.
            sList.append(" ".join(paragraph.get_text().replace("CBS", "Source").split()))
    if len(sList) <= 1:
        return None
    # Join paragraphs with blank lines.  The previous `bText[:-3]` trim
    # also chopped the last character of the final paragraph (the "\n\n"
    # separator is only two characters wide) -- off-by-one fixed here.
    bText = "\n\n".join(sList)
    date = checkElement(soup.find("span", attrs={"class": "time"}), "date")

    ArticleDict = {
        "title": checkElement(soup.find("h1", attrs={"class": "title"}), "title"),
        "author": checkElement(soup.find("span", attrs={"class": "author"}), "author"),
        "body_text": bText,
        "URL": URL,
        "date": date,
        # Pad AM/PM with spaces so the space-delimited format string matches.
        "timestamp": getTime(
            date.replace("AM", " AM ").replace("PM", " PM ").strip(), [",", ":"], [], "%B %d %Y %I %M %p"
        ),
    }
    source = soup.find("span", attrs={"class": "source"})
    if source is not None:
        ArticleDict["source"] = source.get_text()
    else:
        ArticleDict["source"] = "CBS News"
    image = soup.find("div", attrs={"class": "article-image"})
    if image is not None and image.img is not None:
        ArticleDict["image"] = image.img.get("src")
    return ArticleDict
Example #2
0
def scrapeArticle(URL):
    #Indexes title, date, body, and author of given ABC News article
    #returns values in a dictionary
    bText = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    #multi-page stories expose a single-page link; follow it so the whole
    #body is scraped from one document
    pageStatus = soup.find('div', attrs = {'class':'singlepage'})
    if pageStatus is not None:
        soup = BeautifulSoup(urllib.urlopen('http://abcnews.go.com' + pageStatus.a.get('href')))
    for paragraph in soup.find_all('p', attrs = {'itemprop':'articleBody'}):
        #NOTE(review): paragraph.a is a Tag (or None) while linkList holds
        #link *texts*, so this test is effectively always true; the CNN
        #scraper compares get_text() here -- confirm intent before changing
        if paragraph.a not in linkList:
            bText += paragraph.get_text()
    date = checkElement(soup.find('div', attrs = {'class':'date'}), 'date')
    image = soup.find('div', attrs = {'class' : 'main_media'})
    #normalise the byline: keep the part before the first blank line,
    #title-case it, lowercase 'and', rebrand 'Abc' as 'Source'
    author = ' '.join(checkElement(
        soup.find('div', attrs = {'class':'byline'}), 'author')
                      .split('\n\n')[0].lower().title().split())
    author = author.replace('And', 'and').replace('Abc', 'Source')
    ArticleDict = {'title' : checkElement(soup.find('h1', True), 'title'),
                   'author' : author,
                   'body_text' : bText.replace('\n', '\n\n').strip(),
                   'URL' : URL,
                   'source' : 'ABC News',
                   'date' : date,
                   'timestamp' : getTime(date, [',',':','.'], [], '%b %d %Y')}
    if image is not None and image.img is not None:
        if image.img.get('src') is not None:
            ArticleDict['image'] = image.img.get('src')
    return ArticleDict
Example #3
0
def scrapeArticle(URL):
    #Scrapes a Forbes article: title, date, author, and body text
    #returns values in a dictionary
    paraList = []
    ArticleDict = {}
    soup = BeautifulSoup(urllib.urlopen(URL))
    rawList = soup.prettify().split('\n')
    #default up front: the old per-cell else branch reset the date to
    #'Unknown' on every non-matching line, so a date found earlier was
    #almost always overwritten by later lines of the prettified markup
    ArticleDict['date'] = 'Unknown'
    for cell in rawList:
        if 'published_time' in cell:
            cell = cell[cell.find('content=') + 9:]  #skip past 'content="'
            cell = cell[:cell.find('"')]
            p = cell.split('-')  #ISO yyyy-mm-dd
            ArticleDict['date'] = p[1] + '/' + p[2] + '/' + p[0]
            break
    ArticleDict['timestamp'] = getTime(ArticleDict['date'], [','], ['/'],
                                       '%m %d %Y')
    soup2 = BeautifulSoup(str(soup.find('div', attrs={'class': 'body'})))
    for line in soup2.find_all('p'):
        #plain paragraphs only; decorated <p> tags carry a class attribute
        if not line.has_attr('class'):
            paraList.append(line.get_text())
    ArticleDict['author'] = checkElement(soup.find('p'), 'author')
    #last two paragraphs are dropped -- presumably boilerplate; TODO confirm
    ArticleDict['body_text'] = '\n\n'.join(paraList[:-2])
    ArticleDict['title'] = checkElement(soup.find('h1'), 'title')
    ArticleDict['source'] = 'Forbes'
    ArticleDict['URL'] = URL
    return ArticleDict
Example #4
0
def scrapeArticle(URL):
    #Scrapes a CBS article: title, author, date, body text, source, image
    #Returns None when fewer than two usable paragraphs are found
    sList = []
    soup = BeautifulSoup(urllib.urlopen(URL))
    mainStory = BeautifulSoup(str(soup.find('div', attrs = {'class': 'entry'})))
    for paragraph in mainStory.find_all('p'):
        #plain, non-empty paragraphs only; decorated <p> tags have a class
        if not paragraph.has_attr('class') and paragraph.get_text().strip() != '':
            #collapse whitespace and rebrand the network name
            sList.append(' '.join(paragraph.get_text()
                                  .replace('CBS', 'Source').split()))
    if len(sList) <= 1:
        return None
    #join with blank lines; the old bText[:-3] also cut the final
    #character of the last paragraph (the separator is only two chars)
    bText = '\n\n'.join(sList)
    date = checkElement(soup.find('span', attrs = {'class': 'time'}), 'date')

    ArticleDict = {'title' : checkElement(soup.find('h1', attrs = {'class': 'title'}), 'title'),
                   'author' : checkElement(soup.find('span', attrs = {'class':'author'}), 'author'),
                   'body_text' : bText,
                   'URL' : URL,
                   'date' : date,
                   'timestamp' : getTime(date.replace('AM', ' AM ').replace('PM', ' PM ').strip(),
                                         [',',':'], [], '%B %d %Y %I %M %p')}
    source = soup.find('span', attrs = {'class': 'source'})
    if source is not None:
        ArticleDict['source'] = source.get_text()
    else:
        ArticleDict['source'] = 'CBS News'
    image = soup.find('div', attrs = {'class':'article-image'})
    if image is not None and image.img is not None:
        ArticleDict['image'] = image.img.get('src')
    return ArticleDict
def scrapeArticle(URL):
    #Goes through an Al Jazeera article and indexes the title, date, body
    #text and author when available
    #Does not work on galleries and video
    #Strips name Aljazeera from body text
    inText = ''
    curImage = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    for section in soup.find_all('div', attrs={'class': 'text section'}):
        bodySoup = BeautifulSoup(str(section))
        for paragraph in bodySoup.find_all('p'):
            #body text is accumulated as UTF-8 byte strings (Python 2)
            inText += paragraph.get_text().encode('utf-8').strip() + '\n\n'
    ArticleDict = {
        'title':
        checkElement(
            soup.find('div',
                      attrs={'class': 'articleOpinion-title--container'}),
            'title'),
        'author':
        ' '.join(
            checkElement(
                soup.find('span', attrs={'class': 'articleOpinion-byline'}),
                'author').split()),
        'body_text':
        inText,
        'source':
        'Al Jazeera',
        'URL':
        URL
    }
    date = soup.find('span', attrs={'class': 'date'})
    time = soup.find('span', attrs={'class': 'time'})
    if date is not None:
        #pad AM/PM with spaces so the space-delimited format matches
        if time is not None:
            ArticleDict['date'] = (date.get_text() + ' ' +
                                   time.get_text()).encode('utf-8')
            ArticleDict['timestamp'] = getTime(
                ArticleDict['date'].replace('AM',
                                            ' AM ').replace('PM', ' PM '),
                [','], [':'], '%B %d %Y %I %M %p %Z')
        else:
            ArticleDict['date'] = date.get_text().encode('utf-8')
            ArticleDict['timestamp'] = getTime(
                ArticleDict['date'].replace('AM',
                                            ' AM ').replace('PM', ' PM '),
                [','], [':'], '%B %d %Y')
    else:
        ArticleDict['date'] = 'Unknown'
        ArticleDict['timestamp'] = datetime.datetime.now()

    #the hero image is an inline CSS background-image; keep the last
    #1460px variant found in the prettified markup
    imageList = soup.prettify().split('\n')
    for cell in imageList:
        if 'background-image' in cell and '1460' in cell:
            curImage = cell
    #extract the URL between the first pair of single quotes; both finds
    #degrade to the empty string when no image line was matched
    curImage = curImage[(curImage.find("'") + 1):]
    curImage = curImage[:curImage.find("'")]
    if curImage.strip() != '':
        ArticleDict['image'] = 'http://america.aljazeera.com' + curImage
    return ArticleDict
Example #6
0
def scrapeArticle(URL):
    #scrapes CNN article
    #removes links that are not part of the main article
    #returns author, last updated date, title, image link, and the body text in a dictionary
    soup = BeautifulSoup(urllib.urlopen(URL))
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    inText = ''
    for para in soup.find_all('p', attrs = {'class': 'zn-body__paragraph'}):
        #skip paragraphs that are bare page links, quotes, or emphasis blocks
        if para.get_text() not in linkList:
            if para.q is None and para.em is None:
                inText += (para.get_text().strip() + '\n\n')
    date = checkElement(soup.find('div', attrs = {'class': 'cnn_strytmstmp'}), 'date')
    ArticleDict =  {'title' : checkElement(soup.find('h2', attrs = {'class': 'pg-headline'}), 'title'),
                    'author' : checkElement(
                            soup.find('span', attrs = {'class': 'metadata__byline__author'}), 'author'),
                    'source' : 'CNN',
                    'body_text' : inText,
                    'date' : date,
                    'URL' : URL,
                    #leading '0' pads the hour so %I matches single-digit times
                    'timestamp' : getTime('0' + date.replace('Update', ''),
                                          [','] ,[':'],
                                          '%I %M %p %Z %a %B %d %Y')}
    image = soup.find('div', attrs = {'class' : 'cnn_stryimg640captioned'})
    if image is not None:
        #captioned still image: pull the src attribute out of the markup
        markup = BeautifulSoup(str(image)).prettify()
        for cell in markup.split(' '):
            if 'src' in cell:
                ArticleDict['image'] = cell[cell.find('http'):].replace('"', '')
    elif soup.find('div', attrs = {'class':'cnnStryVidCont'}) is not None:
        #video page: pull the thumbnail URL out of the inline player config.
        #The old code reused `image` (still a list when no 'thumb:' line was
        #found) and then crashed on list.find; track the match separately.
        thumb = None
        for cell in soup.prettify().split('\n'):
            if 'thumb:' in cell:
                thumb = cell[cell.find('http'):]
        if thumb is not None:
            #trim at the closing single quote of the config value
            ArticleDict['image'] = thumb[:thumb.find("'")]
    else:
        gallery = soup.find('div', attrs = {'class' : 'cnnArticleGalleryPhotoContainer'})
        if gallery is not None:
            for cell in BeautifulSoup(str(gallery)).prettify().split(' '):
                if 'src' in cell:
                    ArticleDict['image'] = cell[cell.find('http'):].replace('"', '')
    return ArticleDict
def scrapeArticle(URL):
    #Goes through an Al Jazeera article and indexes the title, date, body
    #text and author when available
    #Does not work on galleries and video
    #Strips name Aljazeera from body text
    inText = ''
    curImage = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    for section in soup.find_all('div', attrs = {'class':'text section'}):
        bodySoup = BeautifulSoup(str(section))
        for paragraph in bodySoup.find_all('p'):
            #body text is accumulated as UTF-8 byte strings (Python 2)
            inText += paragraph.get_text().encode('utf-8').strip() + '\n\n'
    ArticleDict = {'title' : checkElement(
                                soup.find('div', attrs = {'class':'articleOpinion-title--container'}),'title'),
                   'author' : ' '.join(checkElement(
                                soup.find('span', attrs = {'class':'articleOpinion-byline'}), 'author').split()),
                   'body_text' : inText,
                   'source' : 'Al Jazeera',
                   'URL' : URL}
    date = soup.find('span', attrs = {'class':'date'})
    time = soup.find('span', attrs = {'class':'time'})
    if date is not None:
        #pad AM/PM with spaces so the space-delimited format matches
        if time is not None:
            ArticleDict['date'] = (date.get_text() + ' ' + time.get_text()).encode('utf-8')
            ArticleDict['timestamp'] = getTime(ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
                                           [','], [':'], '%B %d %Y %I %M %p %Z')
        else:
            ArticleDict['date'] = date.get_text().encode('utf-8')
            ArticleDict['timestamp'] = getTime(ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
                                           [','], [':'], '%B %d %Y')
    else:
        ArticleDict['date'] = 'Unknown'
        ArticleDict['timestamp'] = datetime.datetime.now()

    #the hero image is an inline CSS background-image; keep the last
    #1460px variant found in the prettified markup
    imageList = soup.prettify().split('\n')
    for cell in imageList:
        if 'background-image' in cell and '1460' in cell:
            curImage = cell
    #extract the URL between the first pair of single quotes; both finds
    #degrade to the empty string when no image line was matched
    curImage = curImage[(curImage.find("'") + 1):]
    curImage = curImage[:curImage.find("'")]
    if curImage.strip() != '':
        ArticleDict['image'] = 'http://america.aljazeera.com' + curImage
    return ArticleDict
Example #8
0
def scrapeArticle(URL):
    #Indexes title, date, body, and author of given ABC News article
    #returns values in a dictionary
    bText = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    #multi-page stories expose a single-page link; follow it so the whole
    #body is scraped from one document
    pageStatus = soup.find('div', attrs={'class': 'singlepage'})
    if pageStatus is not None:
        soup = BeautifulSoup(
            urllib.urlopen('http://abcnews.go.com' + pageStatus.a.get('href')))
    for paragraph in soup.find_all('p', attrs={'itemprop': 'articleBody'}):
        #NOTE(review): paragraph.a is a Tag (or None) while linkList holds
        #link *texts*, so this test is effectively always true; the CNN
        #scraper compares get_text() here -- confirm intent before changing
        if paragraph.a not in linkList:
            bText += paragraph.get_text()
    date = checkElement(soup.find('div', attrs={'class': 'date'}), 'date')
    image = soup.find('div', attrs={'class': 'main_media'})
    #normalise the byline: keep the part before the first blank line,
    #title-case it, lowercase 'and', rebrand 'Abc' as 'Source'
    author = " ".join(
        checkElement(
            soup.find('div', attrs={'class': 'byline'}),
            'author').split('\n\n')[0].lower().title().split())
    author = author.replace('And', 'and').replace('Abc', 'Source')
    ArticleDict = {
        'title':
        checkElement(soup.find('h1', True), 'title'),
        'author':
        author,
        'body_text':
        bText.replace('\n', '\n\n').strip(),
        'URL':
        URL,
        'source':
        'ABC News',
        'date':
        date,
        'timestamp':
        getTime(date, [',', ':', '.'], [], '%b %d %Y')
    }
    if image is not None and image.img is not None:
        if image.img.get('src') is not None:
            ArticleDict['image'] = image.img.get('src')
    return ArticleDict
def scrapeArticle(URL):
    #Scrapes article at given url
    #Returns title, author, source, date, and body_text in a dictionary
    bText = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    #the source span reads '-<source> - <date>'; drop the leading dash and
    #split on the remaining one
    #NOTE(review): raises IndexError below when no '-' separator is
    #present -- confirm the page format is guaranteed
    date_source = ' '.join(soup.find('span', attrs = {'class' : 'source'}).get_text().split())[1:].split('-')
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    articleDict = {
        'title' : checkElement(soup.find('h1', attrs = {'class' : 'page-headline'}), 'title'),
        'author' : checkElement(soup.find('span', attrs = {'class':'byline'}), 'author'),
        'source' : date_source[0].strip(),
        'date' : date_source[1].strip(),
        'timestamp' : getTime(date_source[1].strip(), [','], [], '%A %B %d %Y'),
        'URL' : URL}
    body_soup = BeautifulSoup(str(soup.find('div', attrs = {'class' : 'article-text'})))
    for para in body_soup.find_all('p'):
        #skip paragraphs that are just page links repeated in the body
        if para.get_text() not in linkList:
            bText += para.get_text().strip() + '\n\n'
    articleDict['body_text'] = bText
    image = soup.find('div', attrs = {'class' : 'photo'})
    if image is not None and image.img is not None:
        if image.img.get('src') is not None:
            articleDict['image'] = image.img.get('src')
    return articleDict
Example #10
0
def scrapeArticle(URL):
    #Scrapes a Forbes article: title, date, author, and body text
    #returns values in a dictionary
    paraList = []
    ArticleDict = {}
    soup = BeautifulSoup(urllib.urlopen(URL))
    rawList = soup.prettify().split('\n')
    #default up front: the old per-cell else branch reset the date to
    #'Unknown' on every non-matching line, so a date found earlier was
    #almost always overwritten by later lines of the prettified markup
    ArticleDict['date'] = 'Unknown'
    for cell in rawList:
        if 'published_time' in cell:
            cell = cell[cell.find('content=') + 9:]  #skip past 'content="'
            cell = cell[:cell.find('"')]
            p = cell.split('-')  #ISO yyyy-mm-dd
            ArticleDict['date'] = p[1] + '/' + p[2] + '/' + p[0]
            break
    ArticleDict['timestamp'] = getTime(ArticleDict['date'], [','], ['/'], '%m %d %Y')
    soup2 = BeautifulSoup(str(soup.find('div', attrs = {'class':'body'})))
    for line in soup2.find_all('p'):
        #plain paragraphs only; decorated <p> tags carry a class attribute
        if not line.has_attr('class'):
            paraList.append(line.get_text())
    ArticleDict['author'] = checkElement(soup.find('p'),'author')
    #last two paragraphs are dropped -- presumably boilerplate; TODO confirm
    ArticleDict['body_text'] = '\n\n'.join(paraList[:-2])
    ArticleDict['title'] = checkElement(soup.find('h1'), 'title')
    ArticleDict['source'] = 'Forbes'
    ArticleDict['URL'] = URL
    return ArticleDict
Example #11
0
def scrapeMoney(URL):
    #scrapes CNN Money article
    #returns author, last updated date, title, and body text in a dictionary
    #strips the links that are not part of the main article
    soup = BeautifulSoup(urllib.urlopen(URL))
    image = soup.find('div', attrs = {'id': 'ie_dottop'})
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    body_soup = BeautifulSoup(str(soup.find('div', attrs = {'id': 'storytext'})))
    date = checkElement(soup.find('span', attrs = {'class' : 'cnnDateStamp'}), 'date')
    inText = checkElement(soup.find('h2'), 'header') + '\n\n'
    #first <p> is skipped; paragraphs containing a <span> are metadata
    for paragraph in body_soup.find_all('p')[1:]:
        if paragraph.find('span') is not None:
            continue
        #skip paragraphs whose anchor text is a standalone page link;
        #the old test `paragraph.a.get_text() != linkList` compared a
        #string against the whole list and therefore never filtered
        if paragraph.a is not None and paragraph.a.get_text() in linkList:
            continue
        #drop the leading character (stray space/bullet), as before
        inText += paragraph.get_text()[1:] + '\n\n'
    in_soup = BeautifulSoup(str(soup.find('div', attrs = {'id': 'storycontent'})))
    moneyDict = {'title' : checkElement(in_soup.find('h1'), 'title'),
                 'author' : checkElement(
                     soup.find('span', attrs = {'class' : 'byline'}), 'author'),
                 'date' : date,
                 'source' : 'CNN Money',
                 'body_text' : inText,
                 'URL' : URL,
                 'timestamp' : getTime(date, [','] ,[':'],
                                       '%B %d %Y %I %M %p %Z')}
    if image is not None and image.img is not None:
        src = image.img.get('src')
        if src is not None:
            moneyDict['image'] = src
    else:
        #fall back to the large inline figure; guard img in case the
        #figure has no <img> child (previously an AttributeError)
        img2 = soup.find('figure', attrs = {'class': 'body_img body_img--620'})
        if img2 is not None and img2.img is not None:
            moneyDict['image'] = img2.img.get('src')
    return moneyDict