def get_docs(self):
    """Fetch each page in self.pagelist and extract its article text."""
    crwl = Crawler()
    for page in self.pagelist:
        # Skip empty anchors, mailto links, and missing URLs.
        if page is not None and page not in ('#', 'mailto:[email protected]'):
            if not crwl.get_page(page):
                continue
            soup = crwl.return_soup()
            # The article body lives in the "article-text" div.
            content = soup.find('div', {'class': 'article-text'})
            if content is not None:
                # Strip boilerplate blocks (keywords, share buttons, photo
                # captions, related-article columns) before extracting text.
                for attrs in ({'id': 'articleKeywords'},
                              {'id': 'addshare'},
                              {'class': 'rel-block-sec'},
                              {'class': 'photo-caption'},
                              {'class': 'related-column'}):
                    div = content.find('div', attrs)
                    if div is not None:
                        div.decompose()
                # Remove inline <script> tags, then collapse whitespace.
                for script in content('script'):
                    script.extract()
                text = content.text
                text = re.sub(r'\n+', ' ', text)
                text = re.sub(r' +', ' ', text)
                text = text.strip()
                if len(text) <= 10:
                    # Too little text to be a real article; record the URL.
                    self.error_pagelist.append(page)
                else:
                    self.final_docs.append(text)

def __init__(self):
    """Collect the page list and set up MongoDB-backed storage."""
    crwl = Crawler()
    crwl.get_pagelist()
    self.pagelist = crwl.return_pagelist()
    self.soup = crwl.return_soup()
    self.articledb = ArticleDb('localhost', 27017)
    self.articledb.init_backend('testdb', 'testcol')
    self.final_docs = []
    self.error_pagelist = []
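
# Usage sketch: the enclosing class name is not shown in this file, so
# "ArticleScraper" below is a placeholder; substitute the actual class name
# and import path when wiring this up.
#
#     scraper = ArticleScraper()    # builds the page list and DB connection
#     scraper.get_docs()            # fills final_docs and error_pagelist
#     print(len(scraper.final_docs), 'articles extracted')
#     print(len(scraper.error_pagelist), 'pages skipped or near-empty')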