-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
111 lines (75 loc) · 2.54 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#111 lines (74 sloc) 2.54 KB
#!/usr/bin/env python3
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Front-page URL that is passed to GetData when the script runs.
CNN = 'https://edition.cnn.com/'
# Create the Chrome driver as soon as the module is loaded.
# NOTE(review): this rebinds the name `webdriver` from the imported selenium
# module to a driver instance, so the module itself is no longer reachable
# under that name below this line. GetData's methods use this global instance.
webdriver=webdriver.Chrome()
class GetData:
    """Scrape the "Top" stories listed on a front page and print them.

    The methods drive the module-level ``webdriver`` object (the Chrome
    driver instance created when the file is loaded) and parse the rendered
    HTML with BeautifulSoup.
    """

    def __init__(self, url):
        """Remember the front-page URL.

        Args:
            url: Address of the page whose "Top" list ``scrape`` reads; also
                the base against which article hrefs are resolved.
        """
        self.url = url

    def scrape(self):
        """Load the front page, collect the "Top" article links and process them.

        A ``TimeoutException`` from the wait is reported on stdout rather than
        propagated. The browser session is always shut down afterwards.

        Returns:
            An empty string, always.
        """
        try:
            print("scraping www.cnn.com , please wait as it might take a while :)")
            # Use the URL given to the constructor instead of a hard-coded copy.
            webdriver.get(self.url)
            # NOTE(review): the class-name literal carries a trailing space;
            # kept as-is -- confirm it is intended.
            WebDriverWait(webdriver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "cn__title "))
            )
            soup = BeautifulSoup(webdriver.page_source, 'html.parser')
            self.scrape_from_articles(self._top_article_links(soup))
        except TimeoutException:
            print('timed out, check internet connection and try again')
        finally:
            print('done')
            # quit() ends the whole driver session; close() would only close
            # the current window.
            webdriver.quit()
        return ''

    def _top_article_links(self, soup):
        """Return the hrefs found in every <ul> whose <h2> text contains 'Top'."""
        hrefs = []
        for ul in soup.find_all('ul'):
            # get_text() never returns None, unlike .string on a heading
            # that has several children.
            heading = ul.h2.get_text() if ul.h2 else ''
            if 'Top' not in heading:
                continue
            for li in ul.find_all('li'):
                anchor = li.find('a')
                href = anchor.get('href') if anchor else None
                if href:  # skip items without a usable link
                    hrefs.append(href)
        return hrefs

    def scrape_from_articles(self, links):
        """Visit each article link, extract its data and print the results.

        Processing stops at the first ``TimeoutException`` (reported on
        stdout); whatever was collected up to that point is still printed.

        Args:
            links: Iterable of article hrefs, relative or absolute.
        """
        articles_array = []
        try:
            for link in links:
                # urljoin resolves relative hrefs against the front-page URL
                # and leaves already-absolute hrefs untouched.
                webdriver.get(urljoin(self.url, link))
                WebDriverWait(webdriver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "h1"))
                )
                soup = BeautifulSoup(webdriver.page_source, 'html.parser')
                articles_array.append(self._parse_article(soup))
        except TimeoutException:
            print('timed out, check internet connection and try again')
        finally:
            self.print_to_screen(articles_array)
        return

    def _parse_article(self, soup):
        """Build the ``{"title", "paragraph", "links"}`` dict for one article page."""
        heading = soup.h1
        title = heading.get_text() if heading else ''

        # The last <p> that contains a <cite> element wins.
        paragraph = ''
        for p in soup.find_all('p'):
            if p.cite:
                paragraph = p.get_text()

        img_links = []
        for img in soup.find_all('img'):
            src = img.get('src')
            # Ignore images with no src and inline data: URIs.
            if not src or src.startswith('data:'):
                continue
            if "media__image" in str(img.get('class')):
                img_links.append(src)

        return {
            "title": title,
            "paragraph": paragraph,
            "links": img_links,
        }

    def print_to_screen(self, array):
        """Print every article's title (in bold), paragraph and image URLs.

        Args:
            array: Iterable of dicts with the keys ``"title"``,
                ``"paragraph"`` and ``"links"`` (a list of image URLs).
        """
        print("\nTop Articles: \n")
        for article in array:
            # Drop only the leading "//" of protocol-relative URLs; absolute
            # URLs are printed unchanged.
            images = "".join(
                (link[2:] if link.startswith('//') else link) + "\n"
                for link in article['links']
            )
            print('\033[1m'  # bold text
                  + (article['title'] or '')
                  + "\033[0m \n"  # end bold text
                  + (article['paragraph'] or '')
                  + " \nimages: \n"
                  + images
                  + "\n \n")
# Run the scraper against the CNN front page. This executes whenever the
# module is loaded because it is not guarded by `if __name__ == "__main__":`.
GetData(CNN).scrape()