import hashlib
import os

import newspaper
import tldextract


def dl(url):
    url = url.strip()
    # should_exclude() is a URL-filtering helper defined elsewhere in the repo.
    if should_exclude(url):
        return
    # Build a filename from the registered domain plus a hash of the full URL.
    # Note: the builtin hash() has no .hexdigest(); hashlib.md5 is used here.
    ext = tldextract.extract(url)
    domain = '.'.join([x for x in ext if x])
    fname = 'data/{}-{}.txt'.format(domain, hashlib.md5(url.encode()).hexdigest())
    if os.path.isfile(fname):
        return
    # print('Downloading', url)
    try:
        article = newspaper.Article(url, fetch_images=False)
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        # print('Dead link:', url)
        return
        # traceback.print_exc()
    text = article.text
    if text.strip() == '':
        # print('Empty')
        return
    with open(fname, 'w') as out:
        out.write(text)
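# A minimal driver sketch for dl(), assuming a newline-delimited URL list at
# a hypothetical path 'urls.txt' and an arbitrary pool size; the repo's real
# download script may differ. Downloads are I/O-bound, so a process pool
# parallelizes them well.
if __name__ == '__main__':
    from multiprocessing import Pool

    os.makedirs('data', exist_ok=True)
    with open('urls.txt') as f:
        urls = [line.strip() for line in f if line.strip()]
    with Pool(20) as pool:
        pool.map(dl, urls)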
import time

import bs4
import newspaper
from htmlmin import minify
from lxml.html.clean import Cleaner


def newspaper_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):  # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "newspaper",
        }

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except Exception:
        return None, {
            "url": url,
            "scraper": "newspaper",
        }

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata
def bs4_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):  # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "bs4",
        }

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        # find_and_filter_tag() is a helper defined elsewhere in the repo; it
        # returns the text of matching <p> tags plus a word count.
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except Exception:
        return None, {
            "url": url,
            "scraper": "bs4",
        }

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata
def raw_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):  # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "raw",
        }

    try:
        # Strip scripts and styles, then minify; unlike the other scrapers,
        # this one returns cleaned HTML rather than extracted text.
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except Exception:
        return None, {
            "url": url,
            "scraper": "raw",
        }

    if article.text == "":
        return None, {
            "url": url,
            "scraper": "raw",
        }

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
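# Hedged sketch of how a caller might dispatch among the three scrapers by
# name. The mapping and the scrape() wrapper are illustrative assumptions,
# not part of the functions above; all three share the
# (url, memoize) -> (content_or_None, metadata) contract.
SCRAPERS = {
    "newspaper": newspaper_scraper,
    "bs4": bs4_scraper,
    "raw": raw_scraper,
}


def scrape(url, scraper_name="newspaper", memoize=False):
    # content is None on failure; metadata always records the url and scraper.
    return SCRAPERS[scraper_name](url, memoize)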
for subreddit, gt_score in tqdm(queries):
    query = api.search_submissions(
        after=start_time,
        before=end_time,
        filter=['url', 'score'],
        sort_type='score',
        sort='desc',
        score=f'>{gt_score}',
        subreddit=subreddit,
        is_self=False,
        # limit=1000000,
        over_18=False)

    used_links = 0
    for i, subm in enumerate(query):
        url = subm.url
        if should_exclude(url) or url in done_urls:
            continue
        # Re-check the score client-side: weird issue with psaw/pushshift
        # that breaks score=">2".
        # if i % 100 == 0:
        #     tqdm.write(str(subm.score))
        if not (subm.score >= gt_score):
            continue
        # print(subm.score)
        # pbar.write(str(datetime.datetime.fromtimestamp(subm.created_utc)))
        fh.write(url + '\n')
        used_links += 1
        done_urls.add(url)

    tqdm.write(f"Used {used_links} from {subreddit}")
    fh.flush()
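# Hedged sketch of the setup the loop above assumes. The variable names match
# the loop, but the concrete dates, subreddits, thresholds, and output path
# are illustrative only.
# import datetime
# from psaw import PushshiftAPI
# from tqdm import tqdm
#
# api = PushshiftAPI()
# start_time = int(datetime.datetime(2017, 1, 1).timestamp())
# end_time = int(datetime.datetime(2018, 1, 1).timestamp())
# queries = [('news', 3), ('worldnews', 3)]  # (subreddit, min karma score)
# done_urls = set()
# fh = open('urls.txt', 'w')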