def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the author
    article_author_obj = soup.find('a', attrs={"rel": "author"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author = str(article_author[0])
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.findAll('article')

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for article in article_body:
            for paragraph in article.findAll('p'):
                stripped_p = Scraper.string_cleaner(paragraph)
                article_file.write(stripped_p + "\n")
    except:
        # Any parsing failure aborts this article
        return False
    article_file.write("</content>" + "\n")
    return True

    # Get next page - currently disabled
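
# The parsers in this section rely on Scraper.string_cleaner, which is not shown
# here. Below is a minimal sketch of what such a cleaner might look like; it is
# an assumption for illustration, not the project's actual implementation. It
# accepts either a BeautifulSoup Tag or a plain string (both are passed above)
# and reduces the value to whitespace-normalized text.
def string_cleaner_sketch(value):
    from bs4.element import Tag   # bs4 is already used by the parsers above
    if isinstance(value, Tag):
        value = value.get_text()  # drop HTML markup, keep only the text
    # collapse newlines and repeated whitespace into single spaces
    return " ".join(str(value).split())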
def get_HTML_article(url_opener, article_file, article_url):
    # Get URL HTML
    print("Getting HTML article from URL: " + article_url)
    html_response = url_opener.open(article_url)

    # Build HTML parser
    soup = BeautifulSoup(html_response)

    # Get the author
    article_author_obj = soup.find('span', attrs={"itemprop": "name"})
    if article_author_obj is not None:
        article_author = article_author_obj.contents
        author_to_parse = article_author[0].split(",", 1)
        # Strip newlines from the author string before cleaning it
        author = re.sub(r'\n', '', str(author_to_parse[0])).strip()
        author_stripped = Scraper.string_cleaner(author)
    else:
        author_stripped = "Unknown"
    article_file.write("<author>" + author_stripped + "</author>\n\n")

    # Get the article body
    article_body = soup.find(attrs={"itemprop": "articleBody"})

    # Get all paragraphs + clean redundant chars
    article_file.write("<content>" + "\n")
    try:
        for paragraph in article_body.findAll('p'):
            stripped_p = Scraper.string_cleaner(paragraph)
            article_file.write(stripped_p + "\n")
    except:
        # Any parsing failure aborts this article
        return False
    article_file.write("</content>" + "\n")
    return True

    # Get next page - currently disabled
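
# A minimal usage sketch for the parsers above, assuming they live in modules
# such as Scraper.usa_today (as referenced later in this section). The opener
# settings, output path, and URL below are placeholders for illustration, not
# values taken from the project.
import urllib.request

from Scraper import usa_today  # assumed module layout

url_opener = urllib.request.build_opener()
url_opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # avoid naive bot blocking

with open("example_article.xml", "w+", newline="\n") as article_file:
    ok = usa_today.get_HTML_article(
        url_opener, article_file,
        "https://www.usatoday.com/example-article")  # placeholder URL
    print("parsed" if ok else "failed")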
for result in results_obj:
    try:
        # Get ID
        try:
            res_id = result['data-fullname']
        except:
            continue

        # Get entry
        entry_obj = result.find('div', attrs={"class": "entry unvoted"})

        # Get title
        title_obj = (entry_obj.find('p', attrs={"class": "title"})).find('a', attrs={"class": "title "})
        title_parsed = title_obj.contents
        url = title_obj['href']
        title = Scraper.string_cleaner(str(title_parsed[0]))

        # Get domain
        domain_obj = entry_obj.find('p', attrs={"class": "title"})
        span_obj = domain_obj.find('span', attrs={"class": "domain"})
        domain_parsed = span_obj.find('a').contents
        domain = Scraper.string_cleaner(str(domain_parsed[0]))

        # Subreddit
        tagline = entry_obj.find('p', attrs={"class": "tagline"})
        hover = tagline.find('a', attrs={"class": "subreddit hover"})
        subredd_parsed = hover.contents
        subredd = Scraper.string_cleaner(str(subredd_parsed[0]))
        subredd = subredd[:-1]

        # Score
# Iterate the submissions
for sub in submissions:
    try:
        # Accept only posts above the vote threshold
        if sub.score < VOTE_TRESHHOLD:
            continue

        # Open a file with the article id as its name
        article_file = open(str(sub_reddit) + "\\" + str(article_id), 'w+', newline="\n")
        article_file.write("<article>\n")
        article_file.write("<sub-reddit>" + sub_reddit + "</sub-reddit>\n")
        article_file.write("<news-paper>" + sub.domain + "</news-paper>\n")
        article_file.write("\n")
        stripped_title = Scraper.string_cleaner(sub.title)
        article_file.write("<title>" + stripped_title + "</title>\n")

        # Get the article content with the parser matching the domain
        if SUPPORTED_NEWS_SITES[0] in sub.domain:
            success = Scraper.ny_times.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[1] in sub.domain:
            success = Scraper.usa_today.get_HTML_article(url_opener, article_file, sub.url)
        elif SUPPORTED_NEWS_SITES[2] in sub.domain:
            success = Scraper.washington_post.get_HTML_article(url_opener, article_file, sub.url)
        else:
            success = False

        # Close the XML article element
        article_file.write("</article>\n")

        # Found-articles counter