Example 1
	def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
		if HTML_page_Obj is None and article_file_path is not None:
			self.make_from_file(article_file_path)		## assumed to be a helper defined on this class
		elif HTML_page_Obj is not None and article_file_path is None:
			self.article_url=HTML_page_Obj.url
			website_base_url=extraction_text_manip.extract_website(self.article_url)
			print "\n Making article found on %s"%website_base_url
			self.article_headline=""
			self.article_alt_headline_list=[]
			self.article_text=""
			article_soup=HTML_page_Obj.html_prettify()
			table = simpleMySQL.verified_select(conn=conn, select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" %website_base_url)

			if table is not None:
				for row in table:
					try_code = row[1]		## second column holds the stored extraction code
					# print try_code
					try:
						exec(try_code)
					except Exception:
						print "Something went wrong while executing that code. Trying next code..."
					else:
						self.article_headline=article_headline.strip()
						self.article_alt_headline_list=article_alt_headline_list
						self.article_text=article_text.strip()
						return
				print "None of the extraction codes for %s have worked on this article. Please re-check them."%website_base_url
Example 2
	def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
		if HTML_page_Obj is None and article_file_path is not None:
			self.make_from_file(article_file_path)		## assumed to be a helper defined on this class

		elif HTML_page_Obj is not None and article_file_path is None:
			self.article_url=HTML_page_Obj.url
			website_base_url=extraction_text_manip.extract_website(self.article_url)
			print "\n Making article found on %s"%website_base_url
			self.article_headline=""
			self.article_alt_headline_list=[]
			self.article_text=""
			self.article_date=datetime.date(1970, 1, 1)		## Start of UNIX time
			self.article_time=datetime.time(0,0)

			article_soup=HTML_page_Obj.html_prettify()

			table = sqliteDefaults.verified_select_sqlite(conn=conn, select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" %website_base_url)	##<-CHANGED

			if table is not None:
				for row in table:
					try_code = row[1]		## second column holds the stored extraction code
					# print try_code
					try:
						exec(try_code)
					except Exception:
						print "Something went wrong while executing that code. Trying next code..."
					else:
						self.article_headline=article_headline.strip()
						self.article_alt_headline_list=article_alt_headline_list
						self.article_text=article_text.strip()
						self.article_date = article_date
						self.article_time = article_time
						return
				print "None of the extraction codes for %s have worked on this article. \
						Please re-check them."%website_base_url
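For either constructor to find extraction code, the database behind conn needs a website_regex table whose first column is base_url and whose second column holds the code, plus a date_of_addition column to order by; that layout is only inferred from the SELECT query and the row[1] access above, so treat it as an assumption. A minimal sketch of creating and populating such a table with the standard sqlite3 module (the file names are made up):

import sqlite3

conn = sqlite3.connect("articles.db")    ## assumed database file

## Assumed schema, inferred from the SELECT query and the row[1] access above.
conn.execute("""create table if not exists website_regex
                (base_url text, code text, date_of_addition timestamp)""")

## Store an extraction snippet, e.g. the one sketched after Example 1.
extraction_code = open("livemint_try_code.py").read()    ## hypothetical snippet file
conn.execute("insert into website_regex values (?, ?, datetime('now'))",
             ("livemint.com", extraction_code))
conn.commit()

With a row in place, passing this conn and an HTML_page_Obj for a livemint.com page into the constructor above would run the stored snippet and fill in the headline, text, date and time fields.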
Example 3
import re
from bs4 import BeautifulSoup
import extraction_text_manip
'''The purpose of this python file is to help you build the code needed to extract articles from websites'''

url = "http://www.livemint.com/Companies/fghWAFAu1k7JYKnUU31g4I/Nestle-asks-Bombay-HC-for-time-to-reply-to-Maharashtra-FDA-a.html"
website = extraction_text_manip.extract_website(url)

html = extraction_text_manip.get_html(url)

#We must set the following:
article_headline = ""
article_alt_headline_list = []
article_text = ""

article_soup = BeautifulSoup(html)
with open("G:/article.html", 'w') as art_file:
    art_file.write(article_soup.prettify().encode('ascii', 'ignore'))

    #start of website-specific code
    #input: article_soup
website_base_url = "livemint.com"

headline_list = article_soup.find("h1", {"class": "sty_head_38"})
article_headline = ""
for i in headline_list:
    article_headline += extraction_text_manip.properly_encode(str(i))
article_headline = extraction_text_manip.properly_format(article_headline)

article_alt_headline_list = []
alt_headline_list = article_soup.find("div", {"class": "sty_sml_summary_18"})
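The excerpt stops here; by symmetry with the headline loop above, the alternate headlines would presumably be collected along these lines (a sketch, not part of the original file):

for i in alt_headline_list:
    article_alt_headline_list.append(
        extraction_text_manip.properly_format(
            extraction_text_manip.properly_encode(str(i))))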
Example 4
import re
from bs4 import BeautifulSoup
import extraction_text_manip

'''The purpose of this python file is to help you build the code needed to extract articles from websites'''


url="http://www.livemint.com/Companies/fghWAFAu1k7JYKnUU31g4I/Nestle-asks-Bombay-HC-for-time-to-reply-to-Maharashtra-FDA-a.html"
website=extraction_text_manip.extract_website(url)

html=extraction_text_manip.get_html(url)

#We must set the following:
article_headline=""
article_alt_headline_list=[]
article_text=""

article_soup=BeautifulSoup(html)
with open("G:/article.html", 'w') as art_file:
	art_file.write(article_soup.prettify().encode('ascii','ignore'))





					#start of website-specific code
					#input: article_soup
website_base_url="livemint.com" 

headline_list=article_soup.find("h1", {"class":"sty_head_38"})
article_headline=""