def main():
    """Scrape goodyfoodies.blogspot.com and print every post with its comments.

    For each post on the front page the headline and body text are read, the
    page behind the post's comment link is downloaded, and every
    ``comment-block`` element on it becomes a ``Comment``. The collected
    ``Post`` objects are printed after all posts have been processed.
    """
    front_page = BeautifulSoup(
        requests.get('http://goodyfoodies.blogspot.com').text, 'lxml')

    posts = []
    for entry in front_page.find_all(
            'div', class_='post hentry uncustomized-post-template'):
        headline = entry.h3.text
        summary = entry.find('div', class_='post-body entry-content').text

        comment_url = entry.find('span', class_='post-comment-link').a['href']
        comment_page = BeautifulSoup(requests.get(comment_url).text, 'lxml')
        blocks = comment_page.find_all('div', class_='comment-block')

        comm_list = []
        # Comment ids are the 1-based position on the page; every comment is
        # created with reference id 1.
        for number, block in enumerate(blocks, start=1):
            user = block.div.cite.text
            stamp_text = block.find(
                'span', class_='datetime secondary-text').a.text
            timestamp = datetime.datetime.timestamp(parse(stamp_text))
            comm_list.append(
                Comment(number, 1, timestamp, user, block.p.text))

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape journeyofanitaliancook.blogspot.com and print its posts.

    The headline and body text of every front-page post are read, the page
    behind the post's comment link is fetched, and one ``Comment`` is built
    per ``<dl id="comments-block">`` element found there. All ``Post``
    objects are printed at the end.
    """
    home_html = requests.get('http://journeyofanitaliancook.blogspot.com/').text
    home = BeautifulSoup(home_html, 'lxml')

    posts = []
    for entry in home.find_all('div', class_='post hentry'):
        headline = entry.h3.text
        summary = entry.find('div', class_='post-body entry-content').text

        link = entry.find('span', class_='post-comment-link').a['href']
        comment_page = BeautifulSoup(requests.get(link).text, 'lxml')

        # NOTE(review): this iterates the <dl> containers, and .dt / .dd pick
        # the first such child of each, so only the first comment of every
        # block appears to be captured -- confirm against the page markup.
        containers = comment_page.find_all('dl', id='comments-block')

        comm_list = []
        for number, container in enumerate(containers, start=1):
            user = container.dt.span.text
            stamp_text = container.find('p', class_='comment-timestamp').text
            timestamp = datetime.datetime.timestamp(parse(stamp_text))
            comm_list.append(
                Comment(number, 1, timestamp, user, container.dd.p.text))

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape theworldaccordingtoeggface.blogspot.com and print its posts.

    Only the headline and body text of each front-page post are collected.
    Every ``Post`` is created with an empty string in the comments slot.
    """
    source = requests.get('http://theworldaccordingtoeggface.blogspot.com/').text
    soup = BeautifulSoup(source, 'lxml')

    # The home page has no reference to a comment page (only the headline
    # link leads to the post) and, per the original author's note, there are
    # no comments on this blog. The post page was previously downloaded and
    # parsed without its result ever being used, so that request is skipped.
    posts = [
        Post(
            article.h3.text,
            article.find('div', class_='post-body entry-content').text,
            "",
        )
        for article in soup.find_all('div', class_='post hentry')
    ]

    for post in posts:
        print(post)
def main():
    """Scrape mainlymacro.blogspot.com and print each post with its comments.

    For every front-page post the post page is downloaded; the summary is
    the text of all ``<span>`` elements in the post body, and comments are
    read from the ``<ol id="top-ra">`` thread. Replies found under a
    comment's ``comment-replies`` container get the parent's id as their
    reference id; top-level comments use reference id 1. Commenters named
    ``Unknown`` or ``Anonymous`` are replaced by a numbered placeholder that
    restarts at 1 for every post.
    """

    def clean(text):
        """Remove newlines and tabs from scraped text."""
        return text.replace('\n', '').replace('\t', '')

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        nonlocal anonymous
        user = clean(node.find('div', class_='comment-header').cite.text)
        if user in ('Unknown', 'Anonymous'):
            user = '******' + str(anonymous)
            anonymous += 1
        date = clean(
            node.find('span', class_='datetime secondary-text').text)
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        msg = clean(node.find('p', class_='comment-content').text)
        return Comment(comment_id, ref_id, timestamp, user, msg)

    anonymous = 1
    source = requests.get('https://mainlymacro.blogspot.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    for article in soup.find_all(
            'div', class_='post hentry uncustomized-post-template'):
        headline = article.h3.a.text
        post_html = requests.get(article.h3.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        spans = post_soup.find(
            'div', class_='post-body entry-content').find_all('span')
        summary = "".join(" " + clean(span.text) for span in spans)

        # A post without comments has no thread element at all.
        thread = post_soup.find('ol', id='top-ra')
        # NOTE(review): find_all('li') searches recursively, so if replies
        # are nested <li> elements they are also visited here as top-level
        # comments -- confirm against the page markup.
        comments = thread.find_all('li') if thread is not None else []

        comment_id = 0
        comm_list = []
        anonymous = 1
        for comment in comments:
            comment_id += 1
            comm_list.append(build_comment(comment, comment_id, 1))

            replies_box = comment.find('div', class_='comment-replies')
            if replies_box is None:
                continue
            parent_id = comment_id
            try:
                for reply in replies_box.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: a malformed reply (missing tag, unparsable
                # date) ends reply collection for this comment; replies
                # gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape www.honeywerehome.com and print each post with its comments.

    Every ``entry-summary`` block on the front page yields a headline (the
    ``title`` attribute of its first link) and a summary (all ``<p>`` texts,
    each prefixed with a space). The linked post page is downloaded and its
    ``<ol class="comment-list">`` is turned into ``Comment`` objects.
    Replies under a comment's ``<ul class="children">`` get the parent's id
    as their reference id; top-level comments use reference id 1.
    """

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        user = node.find('div', class_='comment-author').cite.text
        date = node.find('span', class_='comment-date').text
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        msg = node.find('div', class_='comment-content').p.text
        return Comment(comment_id, ref_id, timestamp, user, msg)

    source = requests.get('https://www.honeywerehome.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    for article in soup.find_all('div', class_='entry-summary'):
        headline = article.div.a['title']
        summary = "".join(" " + p.text for p in article.find_all('p'))

        post_html = requests.get(article.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        # A post without comments has no comment list; treat that as an
        # empty list instead of failing on None.
        listing = post_soup.find('ol', class_='comment-list')
        # NOTE(review): find_all('li') searches recursively, so nested reply
        # <li> elements are also visited here as top-level comments --
        # confirm against the page markup.
        comments = listing.find_all('li') if listing is not None else []

        comment_id = 0
        comm_list = []
        for comment in comments:
            comment_id += 1
            comm_list.append(build_comment(comment, comment_id, 1))

            children = comment.find('ul', class_='children')
            if children is None:
                continue
            parent_id = comment_id
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: a malformed reply ends reply collection for
                # this comment; replies gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape thinkmarkets.wordpress.com and print each post with comments.

    For every ``<article>`` on the front page the post page is downloaded;
    the summary is the text of all ``<p>`` elements in ``entry-content``
    (each prefixed with a space), and comments come from
    ``<ol class="comment-list">``. Replies under ``<ul class="children">``
    get the parent's id as their reference id; top-level comments use
    reference id 1. Commenters named ``Unknown`` or ``Anonymous`` are
    replaced by a numbered placeholder that restarts at 1 for every post.
    """

    def clean(text):
        """Remove newlines and tabs from scraped text."""
        return text.replace('\n', '').replace('\t', '')

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        nonlocal anonymous
        author = node.find('div', class_='comment-author vcard')
        user = clean(author.b.text)
        if user == '':
            # Fall back to the link inside <b> when <b> itself has no text.
            user = clean(author.b.a.text)
        if user in ('Unknown', 'Anonymous'):
            user = '******' + str(anonymous)
            anonymous += 1
        date = clean(
            node.find('div', class_='comment-metadata').a.time.text)
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        paragraphs = node.find('div', class_='comment-content').find_all('p')
        msg = "".join(clean(p.text) for p in paragraphs)
        return Comment(comment_id, ref_id, timestamp, user, msg)

    anonymous = 1
    source = requests.get('https://thinkmarkets.wordpress.com').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    for article in soup.find_all('article'):
        headline = article.header.h1.a.text
        post_html = requests.get(article.header.h1.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        body_paragraphs = post_soup.find(
            'div', class_='entry-content').find_all('p')
        summary = "".join(" " + clean(p.text) for p in body_paragraphs)

        # A post without comments has no comment list at all.
        listing = post_soup.find('ol', class_='comment-list')
        # NOTE(review): find_all('li') searches recursively, so nested reply
        # <li> elements are also visited here as top-level comments --
        # confirm against the page markup.
        comments = listing.find_all('li') if listing is not None else []

        comment_id = 0
        comm_list = []
        anonymous = 1
        for comment in comments:
            comment_id += 1
            comm_list.append(build_comment(comment, comment_id, 1))

            children = comment.find('ul', class_='children')
            if children is None:
                continue
            parent_id = comment_id
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: a malformed reply ends reply collection for
                # this comment; replies gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape warnewsupdates.blogspot.com and print each post with comments.

    Posts are the ``post hentry`` blocks inside the ``widget Blog``
    container. For each one the post page is downloaded; the summary is the
    post body text with newlines removed, and comments come from
    ``<ol class="commentlist">``. Replies under ``<ul class="children">``
    get the parent's id as their reference id; top-level comments use
    reference id 1. Commenters named ``Unknown`` or ``Anonymous`` are
    replaced by a numbered placeholder that restarts at 1 for every post.
    """

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        nonlocal anonymous
        user = node.find('div', class_='comment-author vcard').cite.text
        if user in ('Unknown', 'Anonymous'):
            user = '******' + str(anonymous)
            anonymous += 1
        date = node.find(
            'div', class_='comment-meta commentmetadata').a.text
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        paragraphs = node.find('div', class_='comment-body').find_all('p')
        # Paragraphs containing 'Like' are skipped -- presumably the text of
        # a like widget; confirm against the page markup.
        msg = "".join(p.text + " " for p in paragraphs if 'Like' not in p.text)
        return Comment(comment_id, ref_id, timestamp, user, msg)

    anonymous = 1
    source = requests.get('http://warnewsupdates.blogspot.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    articles = soup.find('div', class_='widget Blog').find_all(
        'div', class_='post hentry')
    for article in articles:
        headline = article.h3.a.text
        post_html = requests.get(article.h3.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        summary = post_soup.find(
            'div', class_='post-body entry-content').text.replace('\n', "")

        # A post without comments has no comment list at all.
        listing = post_soup.find('ol', class_='commentlist')
        # NOTE(review): find_all('li') searches recursively, so nested reply
        # <li> elements are also visited here as top-level comments --
        # confirm against the page markup.
        comments = listing.find_all('li') if listing is not None else []

        comment_id = 0
        comm_list = []
        anonymous = 1
        for comment in comments:
            comment_id += 1
            comm_list.append(build_comment(comment, comment_id, 1))

            children = comment.find('ul', class_='children')
            if children is None:
                continue
            parent_id = comment_id
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: a malformed reply ends reply collection for
                # this comment; replies gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape 2politicaljunkies.blogspot.com and print each post with comments.

    For every front-page post the post page is downloaded; the summary is
    the post body text with newlines removed. Comments are read from the
    ``<div id="comments">`` section, where author (``dt.comment-author``),
    body (``dd.comment-body``) and footer (``dd.comment-footer``) elements
    are matched up by position. A post without that section gets an empty
    comment list. Commenters named ``Unknown`` or ``Anonymous`` are replaced
    by a numbered placeholder that restarts at 1 for every post.
    """
    source = requests.get('http://2politicaljunkies.blogspot.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    for article in soup.find_all('div', class_='post hentry'):
        headline = article.h3.a.text
        post_html = requests.get(article.h3.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')
        summary = post_soup.find(
            'div', class_='post-body entry-content').text.replace('\n', '')

        section = post_soup.find('div', id='comments')
        if section is None:
            # No comments section: start from empty lists so nothing is
            # carried over from the previous post and no name is undefined.
            authors, bodies, footers = [], [], []
        else:
            authors = section.find_all('dt', class_='comment-author')
            bodies = section.find_all('dd', class_='comment-body')
            footers = section.find_all('dd', class_='comment-footer')

        comm_list = []
        anonymous = 1
        # The three lists are walked in lock-step; zip stops at the shortest,
        # so an element without its counterparts is left out.
        triples = zip(authors, bodies, footers)
        for comment_id, (author, body, footer) in enumerate(triples, start=1):
            user = author.text.replace('said...', '').replace('\n', '')
            if user in ('Unknown', 'Anonymous'):
                user = '******' + str(anonymous)
                anonymous += 1

            date = footer.find('span', class_='comment-timestamp').a.text
            # `parse` comes from the surrounding file -- presumably
            # dateutil's parser; confirm.
            timestamp = datetime.datetime.timestamp(parse(date))

            # A body without a <p> yields an empty message.
            paragraph = body.find('p')
            if paragraph is not None:
                msg = paragraph.text.replace('\n', ' ')
            else:
                msg = ''

            comm_list.append(Comment(comment_id, 1, timestamp, user, msg))

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape inspiringscience.net and print each post with its comments.

    Posts are the ``<article>`` elements inside ``<div id="content">``. For
    each one the post page is downloaded; the summary is the text of all
    ``<p>`` elements in ``post-entry`` (each prefixed with a space), and
    comments come from ``<ol class="commentlist">``. Replies under
    ``<ul class="children">`` get the parent's id as their reference id;
    top-level comments use reference id 1. Commenters named ``Unknown`` or
    ``Anonymous`` are replaced by a numbered placeholder that restarts at 1
    for every post.
    """

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        nonlocal anonymous
        user = node.find('p', class_='comment-author').span.text
        if user in ('Unknown', 'Anonymous'):
            user = '******' + str(anonymous)
            anonymous += 1
        date = node.find('p', class_='comment-date').a.time.text
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        paragraphs = node.find('div', class_='comment-text').find_all('p')
        # The last <p> is dropped -- presumably a non-message trailer such
        # as a reply link; confirm against the page markup. Slicing keeps an
        # empty paragraph list from raising.
        msg = "".join(p.text + " " for p in paragraphs[:-1])
        return Comment(comment_id, ref_id, timestamp, user, msg)

    anonymous = 1
    source = requests.get('http://inspiringscience.net').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    for article in soup.find('div', id='content').find_all('article'):
        headline = article.header.h1.a.text
        post_html = requests.get(article.header.h1.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        body_paragraphs = post_soup.find(
            'div', class_='post-entry').find_all('p')
        summary = "".join(" " + p.text for p in body_paragraphs)

        # A post without comments has no comment list at all.
        listing = post_soup.find('ol', class_='commentlist')
        # NOTE(review): find_all('li') searches recursively, so nested reply
        # <li> elements are also visited here as top-level comments --
        # confirm against the page markup.
        comments = listing.find_all('li') if listing is not None else []

        comment_id = 0
        comm_list = []
        anonymous = 1
        for comment in comments:
            comment_id += 1
            comm_list.append(build_comment(comment, comment_id, 1))

            children = comment.find('ul', class_='children')
            if children is None:
                continue
            parent_id = comment_id
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Best effort: a malformed reply ends reply collection for
                # this comment; replies gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)
def main():
    """Scrape fullymyelinated.wordpress.com and print each post with comments.

    Posts are the ``entry`` blocks inside ``<div id="content-left">``. For
    each one the post page is downloaded; the summary is the text of the
    ``<p>`` elements in ``entry`` after the first one, with newlines
    removed. Comments come from ``<ol class="commentlist snap_preview">``;
    a top-level comment that cannot be parsed is skipped. Replies under
    ``<ul class="children">`` get the parent's id as their reference id;
    top-level comments use reference id 1. Commenters named ``Unknown`` or
    ``Anonymous`` are replaced by a numbered placeholder that restarts at 1
    for every post.
    """
    # Failures expected from scraped markup: a missing tag (attribute access
    # or subscripting on None) or an unparsable date.
    scrape_errors = (AttributeError, TypeError, ValueError, OverflowError)

    def build_comment(node, comment_id, ref_id):
        """Build a ``Comment`` from one comment or reply ``<li>``."""
        nonlocal anonymous
        user = node.find('div', class_='comment-author vcard').cite.text
        if user in ('Unknown', 'Anonymous'):
            user = '******' + str(anonymous)
            anonymous += 1
        date = node.find(
            'div', class_='comment-meta commentmetadata').a.text
        # `parse` comes from the surrounding file -- presumably dateutil's
        # parser; confirm.
        timestamp = datetime.datetime.timestamp(parse(date))
        paragraphs = node.find('div', class_='comment-body').find_all('p')
        msg = "".join(p.text.replace('\n', '') + ' ' for p in paragraphs)
        return Comment(comment_id, ref_id, timestamp, user, msg)

    anonymous = 1
    source = requests.get('https://fullymyelinated.wordpress.com/').text
    soup = BeautifulSoup(source, 'lxml')

    posts = []
    articles = soup.find('div', id='content-left').find_all(
        'div', class_='entry')
    for article in articles:
        headline = article.h2.a.text
        post_html = requests.get(article.h2.a['href']).text
        post_soup = BeautifulSoup(post_html, 'lxml')

        body_paragraphs = post_soup.find('div', class_='entry').find_all('p')
        # Skip the first paragraph to remove the 'leave a comment' text;
        # slicing keeps an empty paragraph list from raising.
        summary = "".join(
            p.text.replace('\n', '') for p in body_paragraphs[1:])

        # A post without comments has no comment list at all.
        listing = post_soup.find('ol', class_='commentlist snap_preview')
        # NOTE(review): find_all('li') searches recursively, so nested reply
        # <li> elements are also visited here as top-level comments --
        # confirm against the page markup.
        comments = listing.find_all('li') if listing is not None else []

        comment_id = 0
        comm_list = []
        anonymous = 1
        for comment in comments:
            comment_id += 1
            try:
                comm_list.append(build_comment(comment, comment_id, 1))
            except scrape_errors:
                # Best effort: an unparsable top-level comment is skipped,
                # but its replies are still attempted below.
                pass

            children = comment.find('ul', class_='children')
            if children is None:
                continue
            parent_id = comment_id
            try:
                for reply in children.find_all('li'):
                    comment_id += 1
                    comm_list.append(
                        build_comment(reply, comment_id, parent_id))
            except scrape_errors:
                # Best effort: a malformed reply ends reply collection for
                # this comment; replies gathered so far are kept.
                pass

        posts.append(Post(headline, summary, comm_list))

    for post in posts:
        print(post)