Beispiel #1
0
 def download_articles(self):
     """Download and store the page behind every Rawurl that is still pending.

     Rows are selected where ``state == Rawurl.STATE_NOT_DOWNLOADED``. For each
     row the url is fetched with ``my_urlopen``, the response is passed through
     BeautifulSoup, and the row receives the serialised markup, the value of
     ``date_get_today()`` as its download date, and the state
     ``Rawurl.STATE_UNPARSED``.

     The session is committed once, after the loop; an exception raised while
     processing any row therefore propagates before that commit is reached.
     """
     pending = self.session.query(Rawurl).filter_by(state=Rawurl.STATE_NOT_DOWNLOADED)

     for entry in pending.all():
         # Store BeautifulSoup's serialisation of the page rather than the
         # raw response returned by my_urlopen.
         parsed_page = BeautifulSoup(my_urlopen(entry.url))
         entry.html_content = str(parsed_page)
         entry.download_date = date_get_today()
         entry.state = Rawurl.STATE_UNPARSED

     self.session.commit()
Beispiel #2
0
 def get_date_from_soup(self, soup):
     """Extract the article date from a parsed page, or return None.

     The date text is taken from ``<div class="time">``; when that tag is
     absent, the first child of ``<p class="autor_line">`` is used instead.
     The text is split on single spaces and the second token decides the
     result:

     * ``'dnes'`` (Czech for "today")      -> ``date_get_today()``
     * ``'včera'`` (Czech for "yesterday") -> ``date_get_yesterday()``
     * anything else -> tokens 1..3 are joined and parsed as ``"%d. %m. %Y"``

     Any exception raised along the way (missing tag, too few tokens,
     unparsable date, ...) is swallowed and None is returned.
     """
     try:
         node = soup.find('div', attrs={'class': 'time'})
         if node is None:
             # Fallback location; raises (and yields None below) when the
             # paragraph is missing as well.
             node = soup.find('p', attrs={'class': 'autor_line'}).contents[0]

         # Split on a single space on purpose: token positions are index-based.
         tokens = node.text.split(" ")
         marker = tokens[1]

         if marker == 'dnes':
             return date_get_today()
         if marker == 'včera':
             return date_get_yesterday()

         day_month_year = " ".join(tokens[1:4])
         return datetime.datetime.strptime(day_month_year, "%d. %m. %Y").date()
     except Exception:
         return None
Beispiel #3
0
 def store_link(self, link, htmlcontent):
     """Create a Rawurl for *link* and add it to the session.

     The new row is built from the link, the state ``Rawurl.STATE_UNPARSED``,
     the given html content and the value of ``date_get_today()``. It is only
     added to the session here; no commit is issued by this method.

     Returns the newly created Rawurl instance.
     """
     initial_state = Rawurl.STATE_UNPARSED
     stored_on = date_get_today()

     record = Rawurl(link, initial_state, htmlcontent, stored_on)
     self.session.add(record)
     return record