import re  # whitespace normalization below; Crawler and ArticleDb are project-local classes


def get_docs(self):
    """Fetch every page in self.pagelist, strip non-article markup,
    and collect the cleaned text in self.final_docs."""
    crwl = Crawler()
    for page in self.pagelist:
        # Skip placeholder anchors, the mailto link, and missing hrefs.
        if page is None or page in ('#', 'mailto:[email protected]'):
            continue
        if crwl.get_page(page) is not True:
            continue
        soup = crwl.return_soup()
        content = soup.find('div', {'class': 'article-text'})
        if content is None:
            continue
        # Drop boilerplate blocks (keywords, share widget, related links,
        # photo captions) before extracting the text.
        for selector in ({'id': 'articleKeywords'},
                         {'id': 'addshare'},
                         {'class': 'rel-block-sec'},
                         {'class': 'photo-caption'},
                         {'class': 'related-column'}):
            div = content.find('div', selector)
            if div is not None:
                div.decompose()
        # Remove inline <script> tags so their code doesn't leak into the text.
        for script in content('script'):
            script.extract()
        # Collapse newlines and runs of spaces into single spaces.
        text = re.sub(r'\n+', ' ', content.text)
        text = re.sub(r' +', ' ', text).strip()
        if len(text) <= 10:
            # Too little text means extraction failed; record the page.
            self.error_pagelist.append(page)
        else:
            self.final_docs.append(text)
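# Quick, dependency-free illustration of the whitespace normalization used
# above (the sample string is invented for the example):
#
#     >>> import re
#     >>> s = "First line\n\n\nsecond   line \n trailing  "
#     >>> re.sub(r' +', ' ', re.sub(r'\n+', ' ', s)).strip()
#     'First line second line trailing'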
def __init__(self):
    # Crawl the site index once to collect the list of article URLs.
    crwl = Crawler()
    crwl.get_pagelist()
    self.pagelist = crwl.return_pagelist()
    self.soup = crwl.return_soup()
    # Storage backend (host and port as configured for the local instance),
    # then the target database and collection.
    self.articledb = ArticleDb('localhost', 27017)
    self.articledb.init_backend('testdb', 'testcol')
    self.final_docs = []        # cleaned article texts
    self.error_pagelist = []    # pages whose extracted text was too short
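# Minimal usage sketch. The two methods above belong to one class whose name
# is not shown in this snippet, so NewsScraper below is a purely hypothetical
# stand-in, and this block assumes it runs at module scope.
if __name__ == '__main__':
    scraper = NewsScraper()  # builds the page list and the ArticleDb backend
    scraper.get_docs()       # fetch, clean, and collect every article
    print('scraped:', len(scraper.final_docs))
    print('errors :', len(scraper.error_pagelist))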