import os
import sys
from datetime import datetime
from urllib.request import urlopen
from urllib.error import URLError

from lxml import html
from django.core.management import call_command

# Project-specific imports: the parser module path is taken from test() below;
# the Article/ArticleFile models and download_image live elsewhere in the app.
import berlin_feed.management.commands.parser as parser


def fetch_articles():
    print(f'{datetime.utcnow()}: fetching urls')
    urls = parser.fetchurls()
    print(f'{datetime.utcnow()}: fetching urls done')
    print(f'{datetime.utcnow()}: downloading new articles')
    for progresscounter, url in enumerate(urls, start=1):
        progress = (progresscounter / len(urls)) * 100

        # skip articles that are already in the database
        if Article.objects.filter(url=url).exists():
            continue

        article = Article()
        try:
            response = urlopen(url)
        except URLError as e:
            print(f'{datetime.utcnow()}: Error opening URL {url}: {e}')
            continue
        doc = response.read()
        htmldoc = html.fromstring(doc)

        try:
            article.url = url
            article.inputdate = parser.parseinputdate(htmldoc)
            article.title = parser.parsetitle(htmldoc)
            article.content = parser.parsecontent(htmldoc)
            collection, descriptions = parser.parselocation(htmldoc)
            article.location = collection
            article.location_description = descriptions

        except Exception as e:
            print(f'{datetime.utcnow()}: Error in parsing with URL: {url}\nerror: {e}')
            # nothing usable was parsed, so skip saving this article
            continue

        try:
            article.save()
        except Exception as e:
            print(f'{datetime.utcnow()}: Error saving article with ID {article.id}, exc: {e}')
            continue

        try:
            image_urls = parser.parse_images(htmldoc)
            for i, (full, thumbnail) in enumerate(image_urls):
                filename = f'{article.id}_{i}'
                image = ArticleFile()
                image.article_fk = article
                full_name = filename + os.path.splitext(full)[1]
                thumbnail_name = filename + '_t' + os.path.splitext(thumbnail)[1]

                # download both sizes and store them via Django's file storage
                image.file.save(full_name, download_image(full))
                image.thumbnail.save(thumbnail_name, download_image(thumbnail))
                image.save()
        except Exception as e:
            print(f'{datetime.utcnow()}: Error downloading images for URL: {url}\nerror: {e}')

        # progress indicator
        sys.stdout.write(f'\r{progress:.2f}%')
        sys.stdout.flush()
    print(f'\n{datetime.utcnow()}: Fetched all Articles!')
    call_command('push_relevant_articles')
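

# The download_image() helper called in fetch_articles() is not shown in this
# snippet. A minimal sketch of what it presumably does, assuming it returns a
# Django File-like object that FieldFile.save() accepts:
from django.core.files.base import ContentFile


def download_image(url):
    # fetch the image bytes and wrap them so Django's storage can save them
    with urlopen(url) as response:
        return ContentFile(response.read())
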
def test():
    import urllib.request
    from urllib.request import urlopen
    from lxml import html
    from django.core.files import File
    import berlin_feed.management.commands.parser as feedparser

    response = urlopen('http://www.berlin.de/polizei/presse-fahndung/archiv/395109/index.html')
    doc = response.read()
    htmldoc = html.fromstring(doc)
    images = feedparser.parse_images(htmldoc)
    (full, thumb) = images[0]
    result = urllib.request.urlretrieve('http://www.berlin.de/' + full)
    # urlretrieve returns (local_filename, headers); open the image in binary mode
    file = File(open(result[0], 'rb'))
    print(file)
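

# Hypothetical wiring, not shown in the source: given the
# berlin_feed.management.commands package path, fetch_articles() would
# typically be exposed as a Django management command along these lines,
# runnable via `python manage.py fetch_articles` (class and help text assumed):
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Fetch and store new articles from the feed'

    def handle(self, *args, **options):
        fetch_articles()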