def _get_content(html):
    """Return the article description followed directly by the story body.

    The ``p.description`` and ``div.story-content`` elements are looked up
    on ``html``; each one's markup is passed through ``clean_html`` and the
    two results are concatenated with no separator.
    """
    description = clean_html(str(html.find('p', {'class': 'description'})))
    story = clean_html(str(html.find('div', {'class': 'story-content'})))
    return description + story
def _get_date(html):
    """Return the article's posted date.

    The ``<span>`` elements inside the ``p.author-name`` paragraphs are
    tried first and returned as cleaned text. When that yields nothing,
    the ``div.publishDate`` element is read instead and its text is
    converted from ``"%d %B %Y"`` to ``"%Y-%m-%d"``.
    """
    # NOTE(review): this function arrived with its line breaks and
    # indentation stripped; the strptime re-format is assumed to belong to
    # the fallback branch only -- confirm against the original layout.
    byline_markup = str(html.findAll('p', {'class': 'author-name'}))
    # Re-parse the serialised byline so its <span> children can be searched.
    span_tags = BeautifulSoup(byline_markup, 'lxml').find_all('span')
    date = clean_html(' '.join(str(span) for span in span_tags))
    if date:
        return date

    publish_div = html.find('div', {'class': 'publishDate'})
    # str.join iterates over the div itself here, not over its markup.
    fallback_text = clean_html(' '.join(publish_div))
    parsed = datetime.datetime.strptime(fallback_text, "%d %B %Y")
    return parsed.strftime("%Y-%m-%d")
def _get_date(html):
    """Return the publish date from ``time.date-mdy`` as ``YYYY-MM-DD``.

    The element's text is expected to be ``mm.dd.yy``; the two-digit year
    is prefixed with ``"20"`` and the fields are reordered.

    Args:
        html: parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The result of ``clean_html`` applied to the reformatted date.

    Raises:
        IndexError: if the element is missing or its text has fewer than
            three dot-separated fields.
    """
    date = html.find('time', {'class': 'date-mdy'})
    # Split the element's text, not str(tag): serialised markup carries
    # attribute values, and any '.' inside them would shift the fields.
    text = date.get_text(strip=True) if date is not None else ''
    fields = [field.strip() for field in text.split('.')]
    # Index explicitly so short input raises IndexError.
    month = fields[0]
    day = fields[1]
    year = "20" + fields[2]
    return clean_html('-'.join((year, month, day)))
def _get_content(html):
    """Return the cleaned paragraphs of the first div in the ``<article>``.

    The ``<article>`` element is serialised and re-parsed with lxml; the
    first ``<div>`` found in that re-parsed markup supplies the ``<p>``
    elements, whose markup is joined with single spaces and passed through
    ``clean_html``.
    """
    article_markup = str(html.find('article'))
    first_div = BeautifulSoup(article_markup, 'lxml').find('div')
    paragraphs = [str(paragraph) for paragraph in first_div.findAll('p')]
    return clean_html(' '.join(paragraphs))
def _get_content(html):
    """Return the cleaned text of every ``<p>`` inside ``div.article-content``.

    Args:
        html: parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The result of ``clean_html`` applied to the paragraphs' markup
        joined by single spaces. When the container is absent this is
        ``clean_html('')``.
    """
    container = html.find('div', {'class': 'article-content'})
    # Search the tag directly: serialising it and re-parsing the string is
    # redundant, and would turn a missing container (the text "None") into
    # a synthetic paragraph.
    paragraphs = container.find_all('p') if container is not None else []
    return clean_html(' '.join(map(str, paragraphs)))
def _get_content(html):
    """Return the cleaned text of every ``<p>`` in the article-text container.

    ``div.article-text-update`` is preferred; when it is absent, the div
    whose class attribute is ``article-text text-merri`` is used instead.

    Args:
        html: parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The result of ``clean_html`` applied to the paragraphs' markup
        joined by single spaces. When neither container exists this is
        ``clean_html('')``.
    """
    container = html.find('div', {'class': 'article-text-update'})
    if container is None:
        container = html.find('div', {'class': 'article-text text-merri'})
    # Search the tag directly: serialising it and re-parsing the string is
    # redundant, and would turn a missing container (the text "None") into
    # a synthetic paragraph.
    paragraphs = container.find_all('p') if container is not None else []
    return clean_html(' '.join(map(str, paragraphs)))
def _get_content(html):
    """Return the cleaned markup of all ``div.section-content`` elements.

    Every matching div is serialised, the pieces are joined with single
    spaces, and the result is passed through ``clean_html``.
    """
    sections = html.findAll('div', {'class': 'section-content'})
    joined_markup = ' '.join(str(section) for section in sections)
    return clean_html(joined_markup)
def _get_date(html):
    """Return the ``time.the-time`` element passed through ``clean_html``."""
    return clean_html(str(html.find('time', {'class': 'the-time'})))
def _get_title(html):
    """Return the ``h1.article-title`` element passed through ``clean_html``."""
    return clean_html(str(html.find('h1', {'class': 'article-title'})))
def _get_content(html):
    """Return the cleaned contents of the first element matching ``.text``.

    The direct children of the first match are serialised, joined with
    single spaces, and passed through ``clean_html``. Indexing the first
    match raises ``IndexError`` when the selector matches nothing.
    """
    first_match = html.select('.text')[0]
    pieces = [str(node) for node in first_match.contents]
    return clean_html(' '.join(pieces))
def _get_title(html):
    """Return the ``h1.title`` element passed through ``clean_html``."""
    heading = html.find('h1', {'class': 'title'})
    heading_markup = str(heading)
    return clean_html(heading_markup)
def _get_title(html):
    """Return the ``h1.story-headline`` element passed through ``clean_html``."""
    headline = html.find('h1', {'class': 'story-headline'})
    headline_markup = str(headline)
    return clean_html(headline_markup)