def article_supersplit(self, article=None):
    """Split a "properly_format"-ed article into nested token lists.

    Returns ``text``: a list of paragraphs, where each paragraph is a
    list of sentences, and each sentence is a list of words with
    punctuation kept as separate tokens.

    Parameters:
        article: raw article text; defaults to ``self.article_text``.
    """
    if article is None:  # identity check, not '== None'
        article = self.article_text
    article = extraction_text_manip.properly_format(article)
    text = article.split("\n")  # get paragraphs
    text = extraction_text_manip.remove_empty_from_list(text)
    for i in range(len(text)):
        text[i] = text[i].split(". ")  # get sentences
        # FIX: original called a bare remove_empty_from_list() here and in
        # the finally-block below, while the first call was qualified — use
        # the qualified name consistently to avoid a NameError.
        text[i] = extraction_text_manip.remove_empty_from_list(text[i])
        for j in range(len(text[i])):
            try:
                text[i][j] = nltk.word_tokenize(text[i][j])
            except Exception:
                # Best-effort fallback tokenizer when NLTK is unavailable
                # or fails: split on spaces, strip bracket/comma/semicolon
                # punctuation, and re-append the sentence-ending period.
                text[i][j] = text[i][j].split(" ")  # get words
                text[i][j].append(".")  # clearer than `list += "."`
                for k in range(len(text[i][j])):
                    # One character-class pass replaces the original eight
                    # chained re.sub calls (same character set removed).
                    text[i][j][k] = re.sub(r"[,;()\[\]{}]", "", text[i][j][k])
                # NOTE(review): this inspects the paragraph's LAST sentence
                # even when j is not the last index; if that sentence is
                # still an untokenized string this assignment would raise —
                # preserved as-is, confirm intent with the original author.
                if text[i][-1][-2][-1] == ".":
                    # BUG FIX: re.sub(".*", repl, s) duplicates repl on
                    # Python 3.7+ (extra empty match at end of string) and
                    # mangles backslashes in repl. Plain slicing — dropping
                    # the trailing period — is what was intended.
                    text[i][-1][-2] = text[i][-1][-2][:-1]
            finally:
                text[i][j] = extraction_text_manip.remove_empty_from_list(text[i][j])
    return text
def article_supersplit(self, article=None):
    """Split a "properly_format"-ed article and return ``text``.

    ``text`` is a list of paragraphs; each paragraph is a list of
    sentences; each sentence is a list of word tokens, punctuation
    separated out.

    Parameters:
        article: raw article text; defaults to ``self.article_text``.
    """
    if article is None:  # 'is None' instead of '== None'
        article = self.article_text
    article = extraction_text_manip.properly_format(article)
    # Hoist the punctuation pattern out of the loops; one character class
    # replaces the original eight chained re.sub calls (same characters).
    punct = re.compile(r"[,;()\[\]{}]")
    text = article.split("\n")  # paragraphs
    text = extraction_text_manip.remove_empty_from_list(text)
    for i in range(len(text)):
        text[i] = text[i].split(". ")  # sentences
        # FIX: the original mixed a bare remove_empty_from_list() with the
        # qualified extraction_text_manip.remove_empty_from_list(); the
        # qualified form is used consistently here (NameError risk).
        text[i] = extraction_text_manip.remove_empty_from_list(text[i])
        for j in range(len(text[i])):
            try:
                text[i][j] = nltk.word_tokenize(text[i][j])
            except Exception:
                # Fallback tokenizer when NLTK fails: whitespace split,
                # punctuation stripped, sentence-ending period restored.
                words = text[i][j].split(" ")
                words.append(".")  # explicit append, not `list += "."`
                text[i][j] = [punct.sub("", w) for w in words]
                # NOTE(review): indexes the paragraph's LAST sentence even
                # when j is not the last one — preserved from the original;
                # confirm this is intentional.
                if text[i][-1][-2][-1] == ".":
                    # BUG FIX: the original re.sub(".*", repl, s) duplicates
                    # repl on Python 3.7+ and mishandles backslashes in
                    # repl; direct slicing drops the trailing period safely.
                    text[i][-1][-2] = text[i][-1][-2][:-1]
            finally:
                text[i][j] = extraction_text_manip.remove_empty_from_list(text[i][j])
    return text
# Reset the per-article accumulators, then parse the fetched page.
# NOTE(review): `html` is assumed to hold the downloaded page markup,
# defined earlier in the file — confirm against the caller.
article_alt_headline_list = []
article_text = ""
article_soup = BeautifulSoup(html)
# Dump the prettified page for offline inspection.
# NOTE(review): hard-coded Windows path ("G:/article.html") and text-mode
# write of encoded bytes — Python-2-era pattern; flag for portability.
with open("G:/article.html", 'w') as art_file:
    art_file.write(article_soup.prettify().encode('ascii', 'ignore'))
#start of website-specific code
#input: article_soup
website_base_url = "livemint.com"
# Headline: find() returns the single <h1 class="sty_head_38"> tag;
# iterating a tag yields its children, which are concatenated here.
headline_list = article_soup.find("h1", {"class": "sty_head_38"})
article_headline = ""
for i in headline_list:
    article_headline += extraction_text_manip.properly_encode(str(i))
article_headline = extraction_text_manip.properly_format(article_headline)
# Alternate headline (article summary block), collected the same way.
article_alt_headline_list = []
alt_headline_list = article_soup.find("div", {"class": "sty_sml_summary_18"})
article_alt_headline = ""
for i in alt_headline_list:
    article_alt_headline += extraction_text_manip.properly_encode(str(i))
article_alt_headline = extraction_text_manip.properly_format(
    article_alt_headline)
article_alt_headline_list.append(article_alt_headline)
# Dateline: every "posted" <div>; raw tag markup is concatenated and
# presumably cleaned up further downstream — not visible in this chunk.
date_list = article_soup.findAll("div", {'class': "sty_posted_txt"})
article_dateline = ""
for i in date_list:
    article_dateline += extraction_text_manip.properly_encode(str(i))
# print article_dateline
with open("G:/article.html", 'w') as art_file: art_file.write(article_soup.prettify().encode('ascii','ignore')) #start of website-specific code #input: article_soup website_base_url="livemint.com" headline_list=article_soup.find("h1", {"class":"sty_head_38"}) article_headline="" for i in headline_list: article_headline+=extraction_text_manip.properly_encode(str(i)) article_headline=extraction_text_manip.properly_format(article_headline) article_alt_headline_list=[] alt_headline_list=article_soup.find("div", {"class":"sty_sml_summary_18"}) article_alt_headline="" for i in alt_headline_list: article_alt_headline+=extraction_text_manip.properly_encode(str(i)) article_alt_headline=extraction_text_manip.properly_format(article_alt_headline) article_alt_headline_list.append(article_alt_headline) date_list=article_soup.findAll("div", {'class':"sty_posted_txt"}) article_dateline="" for i in date_list: article_dateline+=extraction_text_manip.properly_encode(str(i)) # print article_dateline