Exemple #1
0
def build_section(section):
    """Scrape ``section`` from every configured newspaper and save its articles.

    For each entry of the module-level ``list_news_obj`` whose ``paths``
    contains ``section``, the section URL is built with newspaper, its
    articles are fetched through ``news_pool``, and every article that passes
    ``filter_junk_results`` is stored as an ``Article`` row.

    Failures on a single article are printed and skipped so that one bad
    article does not abort the whole run.

    Args:
        section: The section identifier to scrape; it is looked up in each
            source's ``paths`` and passed to ``make_path``.
    """
    # NOTE(review): the original comment said articles without a date are
    # assumed to be recent and get the gathering date. No date is passed to
    # Article here, so presumably the model supplies that default - confirm.
    for newspaper_source in list_news_obj:
        if section not in newspaper_source.paths:
            continue

        section_url = newspaper_source.make_path(section)
        newspaper_build = newspaper.build(section_url)
        # Only one source is handed to the pool per run, so this uses
        # two download threads in total.
        news_pool.set([newspaper_build], threads_per_source=2)
        news_pool.join()

        for article in newspaper_build.articles:
            # Keep the filter result in its own name: rebinding ``section``
            # here would clobber the requested section for every following
            # article and for the remaining newspaper sources.
            article_section = filter_junk_results(article.url,
                                                  newspaper_source.name,
                                                  section)
            if not article_section:
                continue

            _store_article(article, newspaper_source, article_section)


def _store_article(article, newspaper_source, article_section):
    """Download and parse one article, then save it as an ``Article`` row.

    Every exception is caught and printed (best-effort scraping); a database
    integrity error is reported as a duplicate entry.
    """
    try:
        # NOTE(review): news_pool.join() has presumably fetched this article
        # already; the explicit download() is kept from the original -
        # confirm whether it is redundant.
        article.download()
        article.parse()

        title = article.title
        url = article.url
        publication = newspaper_source.name
        city = newspaper_source.place
        body = article.text
        image = article.top_image
        # Only the first listed author is stored; empty string when none.
        authors = article.authors[0] if article.authors else ''

        try:
            a = Article(title=title,
                        url=url,
                        publication=publication,
                        city=city,
                        section=article_section,
                        authors=authors,
                        body=body,
                        image=image)
            a.save()
            print(f'created new article: {a.title}')
        except django.db.utils.IntegrityError as e:
            print('Duplicate entry, not added.', e)
        except Exception as e:
            print(e)
        print(
            f'Title: {title}, url: {url}, publication: {publication}, city: {city}\nsection: {article_section}, authors: {authors}'
        )
    except Exception as e:
        print(e)