Example #1
import hashlib
import os

import newspaper
import tldextract

# should_exclude() is a URL-filtering helper defined elsewhere in the project.


def dl(url):
    url = url.strip()

    if should_exclude(url):
        return

    ext = tldextract.extract(url)
    domain = '.'.join([x for x in ext if x])

    # hashlib.md5 gives a stable hex digest; the built-in hash() has no .hexdigest().
    fname = 'data/{}-{}.txt'.format(domain, hashlib.md5(url.encode()).hexdigest())
    if os.path.isfile(fname):
        return
    # print('Downloading', url)
    try:
        article = newspaper.Article(url, fetch_images=False)
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        # Dead link or failed download; skip this URL.
        return

    text = article.text

    if text.strip() == '':
        # No text could be extracted; skip.
        return

    with open(fname, 'w') as out:
        out.write(text)
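
Every example here relies on a should_exclude() URL filter that the listings do not show. The following is only a rough sketch of what such a filter might look like, assuming it screens out media file extensions and a small domain blacklist; the real project's rules may differ.

import tldextract

# Hypothetical sketch only; the extension list and domain blacklist are assumptions.
EXCLUDED_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.pdf', '.mp3', '.mp4', '.zip')
EXCLUDED_DOMAINS = {'youtube.com', 'twitter.com', 'imgur.com'}

def should_exclude(url):
    lowered = url.lower()
    if lowered.endswith(EXCLUDED_EXTENSIONS):
        return True
    ext = tldextract.extract(lowered)
    registered = '.'.join(part for part in (ext.domain, ext.suffix) if part)
    return registered in EXCLUDED_DOMAINS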
Example #2
import time

import newspaper


def newspaper_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "newspaper",
        }

    try:
        article = newspaper.Article(url,
                                    fetch_images=False,
                                    memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except Exception:
        # Any download/parse failure: return no text, only minimal metadata.
        return None, {
            "url": url,
            "scraper": "newspaper",
        }

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata
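
A minimal call-site sketch (the URL is a placeholder): on success the scraper returns the article text and full metadata, and on exclusion or failure it returns None with a reduced metadata dict.

text, meta = newspaper_scraper('https://example.com/some-article', memoize=False)
if text is None:
    print('skipped or failed:', meta['url'])
else:
    print(meta['word_count'], 'words scraped in', round(meta['elapsed'], 2), 's')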
Example #3
import time

import bs4
import newspaper

# find_and_filter_tag() is a helper defined elsewhere in the project.


def bs4_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "bs4",
        }

    try:
        article = newspaper.Article(url,
                                    fetch_images=False,
                                    memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except Exception:
        # Any download/parse failure: return no text, only minimal metadata.
        return None, {
            "url": url,
            "scraper": "bs4",
        }

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata
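
Judging from its use above, find_and_filter_tag() returns a list of tag texts plus a word count. A hypothetical sketch, with a made-up length cutoff, might look like this; the real helper's filtering rules may differ.

def find_and_filter_tag(tag, soup):
    # Hypothetical helper: collect tag texts, drop very short fragments, count words.
    candidates = [el.get_text().strip() for el in soup.find_all(tag)]
    kept = [t for t in candidates if len(t.split()) >= 5]   # assumed length cutoff
    count = sum(len(t.split()) for t in kept)
    return kept, count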
Example #4
import time

import newspaper
from lxml.html.clean import Cleaner

# minify() (an HTML minifier) and should_exclude() are defined elsewhere
# in the project.


def raw_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "raw",
        }

    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url,
                                    fetch_images=False,
                                    memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except Exception:
        # Any download/cleaning failure: return no HTML, only minimal metadata.
        return None, {
            "url": url,
            "scraper": "raw",
        }
    if article.text == "":
        return None, {
            "url": url,
            "scraper": "raw",
        }

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
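
The three scrapers above share a (url, memoize) -> (content, metadata) contract, so a caller can pick one by name. The dispatcher below is illustrative only and not part of the original listings.

SCRAPERS = {
    'newspaper': newspaper_scraper,
    'bs4': bs4_scraper,
    'raw': raw_scraper,
}

def scrape(url, scraper='newspaper', memoize=False):
    # Returns (text_or_html, metadata); content is None when the scrape fails.
    return SCRAPERS[scraper](url, memoize)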
Example #5
        for subreddit, gt_score in tqdm(queries):
            query = api.search_submissions(
                after=start_time,
                before=end_time,
                filter=['url', 'score'],
                sort_type='score',
                sort='desc',
                score=f'>{gt_score}',
                subreddit=subreddit,
                is_self=False,
                #limit=1000000
                over_18=False)
            used_links = 0
            for i, subm in enumerate(query):
                url = subm.url
                if should_exclude(url) or url in done_urls:
                    continue

                # Re-check the score client-side: a psaw/pushshift quirk means the
                # score=">N" filter is not always honored by the API.
                if subm.score < gt_score:
                    continue
                fh.write(url + '\n')
                used_links += 1
                done_urls.add(url)
            tqdm.write(f"Used {used_links} from {subreddit}")
        fh.flush()
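
The loop above assumes surrounding setup that the snippet does not show: a psaw PushshiftAPI client, a time window, the (subreddit, minimum score) queries, a done_urls set, and an open file handle fh. A rough sketch of that context, with placeholder values, could look like this.

import datetime
import time

from psaw import PushshiftAPI
from tqdm import tqdm

api = PushshiftAPI()
start_time = int(datetime.datetime(2019, 1, 1).timestamp())   # placeholder window
end_time = int(time.time())
queries = [('news', 10), ('worldnews', 10)]                    # placeholder (subreddit, min score) pairs
done_urls = set()

with open('urls.txt', 'w') as fh:
    # ... the for-loop over `queries` shown above runs here ...
    pass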