def fetch_articles():
    """Fetch article URLs from the feed parser, download and persist any
    articles (plus their images) that are not yet in the database, then
    trigger the push of relevant articles.

    Side effects: live network I/O, Article/ArticleFile rows written to the
    database, progress indicator written to stdout.
    """
    print(f'{datetime.utcnow()}: fetching urls')
    urls = parser.fetchurls()
    print(f'{datetime.utcnow()}: fetching urls done')

    print(f'{datetime.utcnow()}: downloading new articles')
    total = len(urls)
    for progresscounter, url in enumerate(urls, start=1):
        # Guard against an empty url list (original divided by len(urls)
        # unconditionally -> ZeroDivisionError).
        progress = (progresscounter / total) * 100 if total else 100.0

        # Article already in database. .exists() issues a cheap EXISTS query
        # instead of materialising every matching row just to count them.
        if Article.objects.filter(url=url).exists():
            # Still advance the progress indicator for skipped items
            # (the original `continue` jumped over it).
            sys.stdout.write('\r' + '%.2f' % progress + '%')
            continue

        try:
            response = urlopen(url)
        except URLError as e:
            print(str(e))
            continue
        doc = response.read()
        htmldoc = html.fromstring(doc)

        article = Article()
        try:
            article.url = url
            article.inputdate = parser.parseinputdate(htmldoc)
            article.title = parser.parsetitle(htmldoc)
            article.content = parser.parsecontent(htmldoc)
            collection, descriptions = parser.parselocation(htmldoc)
            article.location = collection
            article.location_description = descriptions
        except Exception as e:
            print(f'{datetime.utcnow()}: Error in parsing with URL: {url}\nerror:{e}')

        try:
            article.save()
        except Exception as e:
            print(f'{datetime.utcnow()}: Error saving article with ID {article.id} exc:{e}')
            # Without a saved article there is no primary key to name the
            # image files after (article.id would be None), so skip the
            # image handling for this URL.
            continue

        try:
            image_urls = parser.parse_images(htmldoc)
            for i, image_url in enumerate(image_urls):
                filename = f'{article.id}_{i}'
                image = ArticleFile()
                image.article_fk = article
                full, thumbnail = image_url
                full_name = filename + os.path.splitext(full)[1]
                thumbnail_name = filename + '_t' + os.path.splitext(thumbnail)[1]
                image.file.save(full_name, download_image(full))
                image.thumbnail.save(thumbnail_name, download_image(thumbnail))
                image.save()
        except Exception as e:
            print(f'{datetime.utcnow()}: Error parsing URL: {url}\nerror:{e}')

        # progress indicator
        sys.stdout.write('\r' + '%.2f' % progress + '%')

    print(f'{datetime.utcnow()}: Fetched all Articles!')
    call_command('push_relevant_articles')
def test():
    """Manual smoke test: download one known article page, extract its first
    image pair with the feed parser, and retrieve the full-size image.

    NOTE(review): intended to be run by hand; performs live network I/O.
    Relies on `File` (presumably django.core.files.File) being imported at
    module level — verify against the file header.
    """
    import urllib.request
    from urllib.request import urlopen
    from lxml import html
    import berlin_feed.management.commands.parser as feedparser

    response = urlopen('http://www.berlin.de/polizei/presse-fahndung/archiv/395109/index.html')
    doc = response.read()
    htmldoc = html.fromstring(doc)
    images = feedparser.parse_images(htmldoc)
    full, thumb = images[0]
    # urlretrieve returns (local_filename, headers). The original referenced
    # urllib.request without importing the top-level package -> NameError;
    # the local `import urllib.request` above fixes that.
    result = urllib.request.urlretrieve('http://www.berlin.de/' + full)
    # Open the downloaded image in binary mode (it is not text) and close
    # the handle deterministically; the original leaked an open text-mode
    # file object.
    with open(result[0], 'rb') as fh:
        file = File(fh)
        print(file)