def main():
    """Print the links on http://gazeta.ru/ whose URL stays under that address.

    The page is fetched with ``urlOpen.get_html``, passed through
    ``textParser.tags_filter_head_and_script``, and every ``(url, text)``
    pair produced by ``get_url_and_url_text`` is printed when its URL starts
    with the base address. Each printed pair is followed by an empty line.
    """
    base_url = "http://gazeta.ru/"
    markup = textParser.tags_filter_head_and_script(urlOpen.get_html(base_url))

    # Lazily keep only the links that point back under the base address.
    same_site_links = (
        (link, caption)
        for link, caption in get_url_and_url_text(markup, base_url)
        if link.startswith(base_url)
    )
    for link, caption in same_site_links:
        # The trailing "\n" plus print's own newline leaves a blank line.
        print(f"{link} {caption}\n")
def parse(source_url: str):
    """Yield one ``model.NewsData`` per link extracted from *source_url*.

    The page is fetched with ``urlOpen.get_html``, passed through
    ``textParser.tags_filter_head_and_script`` and handed, together with
    *source_url*, to ``get_url_and_url_text``. This is a generator, so
    nothing is downloaded until iteration starts.

    Args:
        source_url: Address of the page to scan for links.

    Yields:
        ``model.NewsData`` with the link URL as ``url``, the link text as
        ``title``, ``datetime.now()`` (taken when the item is produced) as
        ``pub_date`` and ``summary`` set to ``None``.
    """
    page = urlOpen.get_html(source_url)
    cleaned = textParser.tags_filter_head_and_script(page)
    links = get_url_and_url_text(cleaned, source_url)
    for link, caption in links:
        item = model.NewsData(
            url=link,
            title=caption,
            pub_date=datetime.now(),
            summary=None,
        )
        yield item
def worker(news):
    """Fetch one news page and build its ``NewsText`` plus the links found in it.

    The primary key of *news* is printed (followed by a space) after the
    download attempt, whether or not any HTML came back.

    Args:
        news: Object exposing ``url`` and ``pk``.

    Returns:
        A ``(NewsText, url_list)`` tuple when ``urlOpen.get_html`` returned a
        truthy value, otherwise ``None``. The ``NewsText`` is only
        constructed here; this function does not call ``save`` on it.
    """
    page = urlOpen.get_html(news.url)
    print(str(news.pk) + " ")
    if not page:
        return None

    body = textParser.get_text_from_html(page)
    # Links are collected before the tags are stripped from the text.
    links = list(aParser.get_a_from_news_text(news_url=news.url, text=body))
    plain = aParser.remove_all_tags(body)
    prepared = text_prerparer.text_preparer(plain)
    return NewsText(news=news, text=prepared), links
def parse_news(n=None):
    """Fetch and store the text of news items that are not yet parsed.

    Iterates over ``News`` rows with ``is_parsed=False`` (sliced with
    ``[:n]``), prints each id, downloads the page and, when HTML came back,
    creates a ``NewsText`` row and marks the item as parsed.

    Args:
        n: Upper bound of the queryset slice; ``None`` leaves it unbounded.
    """
    pending = News.objects.filter(is_parsed=False)[:n]
    for item in pending.iterator():
        print(str(item.id) + " ")
        page = urlOpen.get_html(item.url)  # 0.19 - 2.5 s
        if not page:
            # NOTE(review): items without HTML are left unparsed here; the
            # nesting of the lines below was reconstructed - confirm.
            continue
        body = textParser.get_text_from_html(page)  # 0.0099 - 0.026 s
        NewsText.objects.create(news=item, text=body)
        item.is_parsed = True
        item.save()  # 0.004 with atomic and 0.23 without
def worker(news):
    """Download a news page and prepare its text and outgoing links.

    Prints ``news.pk`` followed by a space once the download call has
    returned, regardless of its result.

    Args:
        news: Object exposing ``url`` and ``pk``.

    Returns:
        ``(NewsText, url_list)`` when ``urlOpen.get_html`` gave a truthy
        result; ``None`` otherwise. The ``NewsText`` instance is constructed
        but not saved by this function.
    """
    raw_html = urlOpen.get_html(news.url)
    print(str(news.pk) + " ")
    if not raw_html:
        return None

    article = textParser.get_text_from_html(raw_html)
    # Extract the links first: the next step removes all tags from the text.
    outgoing = [*aParser.get_a_from_news_text(news_url=news.url, text=article)]
    cleaned = text_prerparer.text_preparer(aParser.remove_all_tags(article))
    return NewsText(news=news, text=cleaned), outgoing
def worker(input_q: JoinableQueue, output: Queue):
    """Consume tasks from *input_q* until the ``"end"`` sentinel arrives.

    For every task the page at ``task.url`` is fetched and, when HTML came
    back, run through ``textParser.get_text_from_html``; the task is then
    acknowledged and its URL is put on *output*. ``"exit"`` is printed once
    the loop ends.

    Args:
        input_q: Queue of tasks exposing ``url``, terminated by the string
            ``"end"``.
        output: Queue receiving the URL of every processed task.
    """
    from django import db

    # Presumably drops a connection inherited from the parent process so
    # this worker opens its own - confirm against the caller.
    db.connection.close()

    while True:
        task = input_q.get()
        if task == "end":
            # Acknowledge the sentinel as well: every put() must be matched
            # by task_done(), otherwise a producer calling input_q.join()
            # after queueing "end" would block forever.
            input_q.task_done()
            break
        html = urlOpen.get_html(task.url)
        if html:
            # The extracted text is not used here; only the call is kept.
            textParser.get_text_from_html(html)
        # NOTE(review): acknowledged and reported whether or not HTML was
        # retrieved; this nesting was reconstructed - confirm.
        input_q.task_done()
        output.put(task.url)
    print("exit")
def worker(news):
    """Download a news page and wrap its extracted text in a ``NewsText``.

    Prints ``news.id`` followed by a space before the download starts.

    Args:
        news: Object exposing ``url`` and ``id``.

    Returns:
        A ``NewsText`` (constructed, not saved here) when
        ``urlOpen.get_html`` returned a truthy value, otherwise ``None``.
    """
    print(str(news.id) + " ")
    page = urlOpen.get_html(news.url)
    if not page:
        return None
    return NewsText(news=news, text=textParser.get_text_from_html(page))