def build_section(section):
    """Scrape one section from every configured newspaper and store its articles.

    For each source in ``list_news_obj`` whose ``paths`` contains *section*,
    the section URL is built with ``newspaper.build``, downloaded through
    ``news_pool``, and every article accepted by ``filter_junk_results`` is
    parsed and saved as an ``Article`` row. Errors are printed and the
    offending article is skipped, so one bad article does not stop the run.

    Args:
        section: Section key to scrape; it is looked up in each source's
            ``paths`` and passed to ``make_path`` and ``filter_junk_results``.

    Returns:
        None. Results are persisted via ``Article.save()`` and reported with
        ``print``.
    """
    for newspaper_source in list_news_obj:
        if section not in newspaper_source.paths:
            continue

        section_url = newspaper_source.make_path(section)
        newspaper_stack = [newspaper.build(section_url)]
        # Only one source is handed to the pool per run, so this is
        # 1 source * 2 threads_per_source.
        news_pool.set(newspaper_stack, threads_per_source=2)
        news_pool.join()

        for downloaded_paper in newspaper_stack:
            for article in downloaded_paper.articles:
                # Keep the filter result in its own variable: assigning it
                # back to ``section`` would make one rejected article change
                # the section used for every later article and source.
                article_section = filter_junk_results(
                    article.url, newspaper_source.name, section
                )
                if not article_section:
                    continue

                try:
                    # NOTE(review): news_pool.join() above presumably already
                    # fetched the articles; this explicit download is kept to
                    # preserve the existing behaviour -- confirm it is needed.
                    article.download()
                    article.parse()

                    title = article.title
                    url = article.url
                    publication = newspaper_source.name
                    city = newspaper_source.place
                    body = article.text
                    image = article.top_image
                    # Only the first listed author is stored.
                    authors = article.authors[0] if article.authors else ''

                    try:
                        # Some articles don't provide a date; we assume they
                        # were posted recently and use the date the data was
                        # gathered. No date is passed here -- presumably the
                        # model fills it in; TODO confirm.
                        a = Article(
                            title=title,
                            url=url,
                            publication=publication,
                            city=city,
                            section=article_section,
                            authors=authors,
                            body=body,
                            image=image,
                        )
                        a.save()
                        print(f'created new article: {a.title}')
                    except django.db.utils.IntegrityError as e:
                        print('Duplicate entry, not added.', e)
                    except Exception as e:
                        # Best-effort import: report the record that could
                        # not be stored and move on to the next article.
                        print(e)
                        print(
                            f'Title: {title}, url: {url}, publication: {publication}, city: {city}\nsection: {article_section}, authors: {authors}'
                        )
                except Exception as e:
                    # Download/parse failures are reported and skipped.
                    print(e)