Example #1
0
    def article_supersplit(self, article=None):
        """Split a "properly_format"ed article into nested word lists.

        Returns 'text', structured as:
            a list of paragraphs,
                where each paragraph is a list of sentences,
                    where each sentence is a list of words, with
                    punctuation marks as separate words.

        Parameters:
            article: raw article text; defaults to self.article_text.
        """
        if article is None:  # identity check per PEP 8, not '== None'
            article = self.article_text

        article = extraction_text_manip.properly_format(article)

        text = article.split("\n")  # get paragraphs
        text = extraction_text_manip.remove_empty_from_list(text)
        # One compiled pattern replaces the original chain of eight
        # re.sub calls: strip , ; ( ) [ ] { } from each word.
        punctuation = re.compile(r"[,;()\[\]{}]")
        for i in range(len(text)):
            text[i] = text[i].split(". ")  # get sentences
            # Qualified consistently with the module-level call above;
            # the original bare remove_empty_from_list risked a NameError.
            text[i] = extraction_text_manip.remove_empty_from_list(text[i])
            for j in range(len(text[i])):
                try:
                    text[i][j] = nltk.word_tokenize(text[i][j])
                except Exception:
                    # NLTK failed; fall back to a naive whitespace split.
                    text[i][j] = text[i][j].split(" ")  # get words
                    # Re-append the "." that split(". ") consumed, as its
                    # own one-character list element.
                    text[i][j] += "."
                    for k in range(len(text[i][j])):
                        text[i][j][k] = punctuation.sub("", text[i][j][k])

                    # If the second-to-last token of the paragraph's final
                    # sentence still ends in ".", drop that period by
                    # slicing.  The original re.sub(".*", repl, s) was
                    # buggy: on Python >= 3.7 ".*" matches twice (once at
                    # end-of-string), duplicating the replacement, and any
                    # backslash in repl is treated as an escape.
                    if text[i][-1][-2][-1] == ".":
                        text[i][-1][-2] = text[i][-1][-2][:-1]
                finally:
                    text[i][j] = extraction_text_manip.remove_empty_from_list(
                        text[i][j])

        return text
	def article_supersplit(self, article=None):
		"""Split a "properly_format"ed article into nested word lists.

		Returns 'text', structured as:
			a list of paragraphs,
				where each paragraph is a list of sentences,
					where each sentence is a list of words, with
					punctuation marks as separate words.

		Parameters:
			article: raw article text; defaults to self.article_text.
		"""
		if article is None:  # identity check per PEP 8, not '== None'
			article = self.article_text

		article = extraction_text_manip.properly_format(article)

		text = article.split("\n")  # get paragraphs
		text = extraction_text_manip.remove_empty_from_list(text)
		# One compiled pattern replaces the original chain of eight
		# re.sub calls: strip , ; ( ) [ ] { } from each word.
		punctuation = re.compile(r"[,;()\[\]{}]")
		for i in range(len(text)):
			text[i] = text[i].split(". ")  # get sentences
			# Qualified consistently with the module-level call above;
			# the original bare remove_empty_from_list risked a NameError.
			text[i] = extraction_text_manip.remove_empty_from_list(text[i])
			for j in range(len(text[i])):
				try:
					text[i][j] = nltk.word_tokenize(text[i][j])
				except Exception:
					# NLTK failed; fall back to a naive whitespace split.
					text[i][j] = text[i][j].split(" ")  # get words
					# Re-append the "." that split(". ") consumed, as its
					# own one-character list element.
					text[i][j] += "."
					for k in range(len(text[i][j])):
						text[i][j][k] = punctuation.sub("", text[i][j][k])

					# If the second-to-last token of the paragraph's final
					# sentence still ends in ".", drop that period by
					# slicing.  The original re.sub(".*", repl, s) was
					# buggy: on Python >= 3.7 ".*" matches twice (once at
					# end-of-string), duplicating the replacement, and any
					# backslash in repl is treated as an escape.
					if text[i][-1][-2][-1] == ".":
						text[i][-1][-2] = text[i][-1][-2][:-1]
				finally:
					text[i][j] = extraction_text_manip.remove_empty_from_list(text[i][j])

		return text
# Website-specific extraction for one livemint.com article page:
# pull the headline, alternate headline, and dateline out of the
# already-fetched `html` via BeautifulSoup.
article_alt_headline_list = []
article_text = ""

article_soup = BeautifulSoup(html)
# Dump a prettified ASCII snapshot of the page for offline inspection.
with open("G:/article.html", 'w') as art_file:
    art_file.write(article_soup.prettify().encode('ascii', 'ignore'))

# start of website-specific code
# input: article_soup
website_base_url = "livemint.com"

# Headline: every child of the <h1 class="sty_head_38"> node, concatenated.
headline_list = article_soup.find("h1", {"class": "sty_head_38"})
article_headline = "".join(
    extraction_text_manip.properly_encode(str(node)) for node in headline_list)
article_headline = extraction_text_manip.properly_format(article_headline)

# Alternate headline (the article's summary blurb), kept in a list.
article_alt_headline_list = []
alt_headline_list = article_soup.find("div", {"class": "sty_sml_summary_18"})
article_alt_headline = "".join(
    extraction_text_manip.properly_encode(str(node))
    for node in alt_headline_list)
article_alt_headline = extraction_text_manip.properly_format(
    article_alt_headline)
article_alt_headline_list.append(article_alt_headline)

# Dateline: all <div class="sty_posted_txt"> blocks, concatenated.
date_list = article_soup.findAll("div", {'class': "sty_posted_txt"})
article_dateline = "".join(
    extraction_text_manip.properly_encode(str(node)) for node in date_list)
# print article_dateline
# Write a prettified, ASCII-only snapshot of the parsed page to disk.
# NOTE(review): text-mode 'w' with .encode() only works on Python 2 —
# on Python 3 this write would raise TypeError; confirm target runtime.
with open("G:/article.html", 'w') as art_file:
	art_file.write(article_soup.prettify().encode('ascii','ignore'))




					#start of website-specific code
					#input: article_soup
website_base_url="livemint.com" 

# Headline: every child of the <h1 class="sty_head_38"> node, concatenated.
headline_list=article_soup.find("h1", {"class":"sty_head_38"})
article_headline=""
for i in headline_list:
	article_headline+=extraction_text_manip.properly_encode(str(i))
article_headline=extraction_text_manip.properly_format(article_headline)

# Alternate headline (the article's summary blurb), collected into a
# one-element list.
article_alt_headline_list=[]
alt_headline_list=article_soup.find("div", {"class":"sty_sml_summary_18"})
article_alt_headline=""
for i in alt_headline_list:
	article_alt_headline+=extraction_text_manip.properly_encode(str(i))
article_alt_headline=extraction_text_manip.properly_format(article_alt_headline)
article_alt_headline_list.append(article_alt_headline)


# Dateline: all <div class="sty_posted_txt"> blocks, concatenated.
date_list=article_soup.findAll("div", {'class':"sty_posted_txt"})
article_dateline=""
for i in date_list:
	article_dateline+=extraction_text_manip.properly_encode(str(i))
# print article_dateline