def scrape(curr_url, hash, soup, results):
    print('Found defencereview.gr...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='newsmag-post-meta'):
                for d in c.select('a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.newsmag-date'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.newsmag-custom-header'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
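# All of these scrapers assume module-level imports (bs4, selenium, time,
# and a project-local `utils`) that live outside this section. The two
# `utils` helpers are called everywhere but not defined here, so their exact
# behavior is an assumption; a minimal sketch consistent with how they are
# used (flatten a tag to text, then collapse whitespace so each record stays
# on one line) might look like:
import re


def clean_soup(tag):
    # flatten a BeautifulSoup tag to plain text; swapping double quotes for
    # single quotes is an assumption, made so the hand-built JSON strings
    # in the scrapers stay parseable
    return tag.get_text(separator=' ', strip=True).replace('"', "'")


def clean_whitespaces(text):
    # collapse runs of whitespace (including newlines) to single spaces
    return re.sub(r'\s+', ' ', text).strip()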
def scrape(curr_url, hash, soup, results):
    print('Found disidentia.com...')
    counter = 0
    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')
        counter += 1
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='td-module-meta-info'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            break
        for c in t.find_all('h1', class_='entry-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
            break
        result = result + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='td-post-content tagdiv-type'):
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
    # comments
    if len(soup.find_all('ol', class_='comment-list')) > 0:
        print('Getting custom comments...')
        for t in soup.find_all('div', class_='comment-content'):
            counter += 1
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found katohika.gr...')
    # article
    for t in soup.find_all('div', id='content'):
        if len(soup.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', class_='entry-author'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments-section'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found periodistadigital.com...')
    for t in soup.find_all('div', id='m4p-post-detail'):
        print('Getting wordpress article...')
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='m4p-author_time'):
            result = result + '\"meta\":\"' + utils.clean_soup(c)
            result = result + '\",'
        for c in t.find_all('h1', class_='m4p-size-1'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
            result = result + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='m4p-post-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
def scrape(curr_url, hash, soup, results):
    print('Found okdiario.com...')
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('address', class_='autor'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            for c in t.find_all('div', class_='entry-content'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
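# Every scraper in this file assembles its JSON record by string
# concatenation, which silently breaks if a stray double quote or backslash
# survives clean_soup(). As a sketch only (this is not what the functions
# above and below actually do), the same record could be built as a dict and
# serialized with json.dumps; `make_record` is a hypothetical helper name:
import json


def make_record(page_hash, curr_url, kind, text, meta='', title=''):
    record = {
        'meta': {
            'id': str(page_hash),
            'type': kind,  # 'article' or 'comment'
            'source': curr_url,
            'meta': meta,
            'title': title,
        },
        'text': text,
    }
    # ensure_ascii=False keeps Greek and Spanish text readable in the output
    return json.dumps(record, ensure_ascii=False)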
def scrape(curr_url, hash, soup, results):
    print('Found arxaiaithomi.gr...')
    # article
    for t in soup.find_all('div', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='post-footer'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-headline > h2'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='post-bodycopy'):
                # keep only non-div children (skips embedded share/related boxes)
                for d in c.find_all(recursive=False):
                    if d.name != 'div':
                        result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found ekklisiaonline.gr...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.select('h6.entry-date'):
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.select('div#article > article'):
                for d in c.find_all('p', class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found alertadigital.com...')
    for t in soup.find_all('div', id='homepost'):
        if len(t.find_all('div', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', id='datemeta'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h2', class_=''):
                result = result + '\"title\":\"' + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            for c in t.find_all('div', class_='entry'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found makeleio.gr...')
    # article
    for t in soup.find_all('div', class_='single-style1-wrap'):
        print('Getting wordpress article...')
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='single-style1-meta-tag'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
        for c in t.find_all('div', class_='single-style1-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='single-style1-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
    # comments
    for t in soup.find_all('div', class_='comments-area'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found hellenicns.gr...')
    # articles
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='below-entry-meta'):
                for d in c.select('time.published'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.author > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.tag-links'):
                    result = result + utils.clean_soup(d) + ' '
                break
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found vimaorthodoxias.gr...')
    for t in soup.find_all('div', class_='post-wrap'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='jeg_meta_container'):
                for d in c.select('div.jeg_meta_author'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.jeg_meta_date > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.jeg_meta_category > span > a'):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.select('div.jeg_post_tags > a'):
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.entry-header > h1.jeg_post_title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='content-inner'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
def scrape(curr_url, hash, soup, results):
    print('Found voicenews.gr...')
    # webdriver: render the page so the lazy-loaded 'read more' body is present
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(5)
    driver.get(curr_url)
    try:
        print('deploying webdriver...')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'brm-more-link')))
        driver.execute_script(
            "document.getElementsByClassName('brm-more-link')[0].scrollIntoView();setTimeout(function(){},2000);"
        )
        driver.execute_script(
            "document.getElementsByClassName('brm-more-link')[0].click();setTimeout(function(){},2000);"
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")
    except Exception:
        print('webdriver timeout...')
    finally:
        driver.close()
    # article
    for t in soup.find_all('div', class_='post-inner'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('p', class_='post-meta'):
                for d in c.select('span.tie-date'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.post-cats > a'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-inner > h1.post-title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='brm'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
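# The headless-Firefox pattern above (build options, wait for a marker
# element, re-parse page_source, close the driver) also appears in the
# pentapostagma.gr scraper below. A hedged sketch of a shared helper;
# `render_page` is a hypothetical name, not part of this project:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def render_page(url, wait_locator, timeout=10):
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        # block until the marker element exists, then hand back parsed HTML
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(wait_locator))
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

# usage sketch: soup = render_page(curr_url, (By.CLASS_NAME, 'brm-more-link'))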
def scrape(curr_url, hash, soup, results):
    print('Found pentapostagma.gr...')
    ds_url = ''
    # article
    for t in soup.find_all('article', class_='article'):
        if len(soup.find_all('div', class_='article__body')) > 0:
            print('Getting drupal article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='article__top-details'):
                for d in c.select('time.default-date'):
                    result = result + utils.clean_soup(d) + ' '
                    break
                for d in c.select('a.default-category'):
                    result = result + utils.clean_soup(d)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('section.article__top > h1'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='article__body'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    # comments: the Disqus thread is rendered client-side inside an iframe,
    # so a headless browser is needed to reach it
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(5)
    try:
        driver.get(curr_url)
        driver.execute_script(
            "document.getElementById('disqus_thread').scrollIntoView();")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'disqus_thread')))
        cont = BeautifulSoup(driver.page_source, "html.parser")
        for i in cont.find_all('iframe'):
            if i.has_attr('src') and i['src'].find('disqus.com/embed') >= 0:
                ds_url = i['src']
                print('found disqus thread with url:', ds_url)
                break
    except Exception:
        print('site webdriver timeout...')
    try:
        driver.get(ds_url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'post-message')))
        # click 'load more' repeatedly to pull in older comments
        for i in range(10):
            print('load more')
            driver.execute_script(
                "document.getElementsByClassName('load-more__button')[0].scrollIntoView();"
            )
            driver.execute_script(
                "document.getElementsByClassName('load-more__button')[0].click();"
            )
            time.sleep(2)
    except Exception:
        print('disqus webdriver timeout...')
    try:
        print('Getting disqus comments...')
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for t in soup.find_all('div', class_='post-message'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    except Exception:
        print('disqus webdriver empty...')
    finally:
        driver.close()
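# The driver code that calls these scrape() functions is outside this
# section; a minimal usage sketch under assumed conventions (requests for
# the fetch, an md5 of the URL as the record id; the real pipeline may
# derive `hash` differently):
import hashlib

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    curr_url = 'https://www.pentapostagma.gr/example-article'  # hypothetical URL
    results = []
    page = requests.get(curr_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    scrape(curr_url, hashlib.md5(curr_url.encode()).hexdigest(), soup, results)
    for record in results:
        print(record)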