# Shared imports for the scrapers in this section (the selenium pieces are
# needed by the olympia.gr, youtube, and generic scrapers further down).
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains, FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import utils


def scrape(curr_url, hash, soup, results):
    print('Found gaceta.es...')
    # article
    for t in soup.find_all('div', class_='article-post-content'):
        if len(t.find_all('h1', class_='entry-title')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('time', class_='date'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)
            dt["meta"] = dm
            for c in t.find_all('div', class_='post-content'):
                dt["text"] = utils.clean_soup(c)
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

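
# NOTE (assumption): utils.clean_soup and utils.clean_whitespaces are used
# throughout this section but defined elsewhere in the repo. A minimal sketch
# of the behavior the scrapers rely on (plain-text extraction with collapsed
# whitespace); not the repo's actual implementation:
def _clean_whitespaces_sketch(s):
    # collapse runs of whitespace (including newlines) into single spaces
    return ' '.join(s.split())


def _clean_soup_sketch(tag):
    # extract the visible text of a BeautifulSoup tag and normalize it
    return _clean_whitespaces_sketch(tag.get_text(separator=' ', strip=True))
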
def scrape(curr_url, hash, soup, results):
    print('Found ilpopulista.it...')
    for t in soup.find_all('body', class_='news'):
        if len(soup.find_all('div', class_='vc_article_body')) > 0:
            print('Getting custom article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('p', class_='autore_articolo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('p', class_='data_articolo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='titolo_articolo'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='vc_article_body'):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found mediterraneodigital.com...')
    # article
    for t in soup.find_all('div', class_='item-page'):
        print('Getting joomla article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('dl', class_='article-info'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='article-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('section', class_='article-content'):
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found arxaiaithomi.gr...')
    # article
    for t in soup.find_all('div', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            # restored the accumulation line so the loop has a body and
            # the meta field is actually collected
            for c in t.find_all('div', class_='post-footer'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-headline > h2'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='post-bodycopy'):
                for d in c.find_all(recursive=False):
                    if d.name != 'div':
                        result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found periodistadigital.com...')
    for t in soup.find_all('div', id='m4p-post-detail'):
        print('Getting wordpress article...')
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='m4p-author_time'):
            result = result + '\"meta\":\"' + utils.clean_soup(c)
        result = result + '\",'
        for c in t.find_all('h1', class_='m4p-size-1'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
        result = result + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='m4p-post-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found la7.it...')
    for t in soup.find_all('body', class_='node-type-la7-video'):
        print('Getting drupal article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='infoVideoRow'):
            for d in c.find_all('div', class_='dateVideo'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find_all('div', class_='infoVideoRow'):
            for d in c.find_all('h1'):
                dm["title"] = dm["title"] + utils.clean_soup(d) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('div', class_='occhiello'):
            for d in c.find_all('p'):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found rainews.it...')
    # article
    for t in soup.find_all('div', class_='boxArticle'):
        if len(t.find_all('article', class_='')) > 0:
            print('Getting custom article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='text'):
                for d in c.find_all('time', class_='articleDate'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
            dm["title"] = ''
            for c in t.find_all('div', class_='title'):
                for d in c.find_all('h1', class_=''):
                    dm["title"] = dm["title"] + utils.clean_soup(d) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            # restored the accumulation line so the loop has a body and
            # the article text is actually captured
            for c in t.find_all('div', class_='text'):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found ilprimatonazionale.it...')
    # article
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='td-post-content')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='td-module-meta-info'):
                for d in c.find_all('span', class_='td-post-date'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
                for d in c.find_all('div', class_='td-post-author-name'):
                    dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
                break
            dm["title"] = ''
            # restored the accumulation line so the title loop has a body
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='td-post-content'):
                for d in c.find_all('p', class_=''):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found okdiario.com...')
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('address', class_='autor'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            for c in t.find_all('div', class_='entry-content'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

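
# Design note: the manual string concatenation above does not escape quotes
# or backslashes inside scraped text, so a single stray '"' yields invalid
# JSON. Other scrapers in this section build a dict and call json.dumps
# instead. A sketch (the function name is illustrative, not from the repo)
# of the same okdiario.com extraction in that safer style, reusing the
# selectors from the scraper above:
def scrape_okdiario_dict_sketch(curr_url, hash, soup, results):
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='entry-content')) > 0:
            dm = {"id": str(hash), "type": 'article', "source": curr_url}
            # join multiple matches with spaces, as the string version does
            dm["meta"] = ' '.join(utils.clean_soup(c) for c in t.find_all('address', class_='autor'))
            dm["title"] = ' '.join(utils.clean_soup(c) for c in t.find_all('h1', class_='entry-title'))
            dt = {"meta": dm}
            dt["text"] = ' '.join(utils.clean_soup(c) for c in t.find_all('div', class_='entry-content'))
            results.append(json.dumps(dt, ensure_ascii=False))
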
def scrape(curr_url, hash, soup, results):
    print('Found ekklisiaonline.gr...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            # restored the accumulation line so the loop has a body
            for c in t.select('h6.entry-date'):
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.select('div#article > article'):
                for d in c.find_all('p', class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found fratelli-italia.it...')
    for t in soup.find_all('body', class_='single-post'):
        print('Getting wordpress article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('ul', class_='post-options'):
            for d in c.find_all('time'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        for c in t.find_all('div', class_='post-tags'):
            for d in c.find_all('a'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find('div', id='wrappermain-cs').find(
                'div', class_='breadcrumb').find_all('h1', class_='cs-page-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find('article', class_='type-post').find(
                'div', class_='detail_text').find_all('p'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found lasvocesdelpueblo.com...')
    for t in soup.find_all('div', class_='wrap-content'):
        if len(t.find_all('div', class_='entry-content-inner')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('div', class_='avatar-meta'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)
            dt["meta"] = dm
            for c in t.find_all('div', class_='entry-content-inner'):
                dt["text"] = utils.clean_soup(c)
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found diarioya.es...')
    # article
    for t in soup.find_all('div', class_='node-content'):
        if len(t.find_all('h1', class_='title')) > 0:
            print('Getting custom article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.find_all('span', class_='article-header__time'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='title'):
                dm["title"] = utils.clean_soup(c)
            dt["meta"] = dm
            for c in t.find_all('div', class_='content'):
                dt["text"] = utils.clean_soup(c)
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found alertadigital.com...')
    for t in soup.find_all('div', id='homepost'):
        if len(t.find_all('div', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', id='datemeta'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h2', class_=''):
                result = result + '\"title\":\"' + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            for c in t.find_all('div', class_='entry'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found elikoncc.info...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('div', class_='post-category'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found defencereview.gr...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='newsmag-post-meta'):
                for d in c.select('a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.newsmag-date'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            # restored the accumulation line so the title loop has a body
            for c in t.select('div.newsmag-custom-header'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            # restored the accumulation line so the comment text is captured
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found destra.it...')
    # article
    for t in soup.find_all('div', class_='single'):
        print('Getting wordpress article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('span', class_='post-author'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('span', class_='post-date'):
                dm["meta"] = dm["meta"] + utils.clean_soup(d) + ' '
        dm["title"] = ''
        for c in t.find_all('div', class_='post-meta'):
            for d in c.find_all('h1', class_=''):
                dm["title"] = dm["title"] + utils.clean_soup(d) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        # restored the accumulation line so the article text is captured
        for c in t.find_all('div', class_='post-content'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-text'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found identità.it...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='single')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            # restored the accumulation line so the meta loop has a body
            for c in t.find_all('div', class_='entry-meta'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('header', class_=''):
                for d in c.find_all('h2', class_=''):
                    dm["title"] = dm["title"] + utils.clean_soup(d) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all('p', class_=''):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-text'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found olympia.gr...')
    # reload with selenium
    with webdriver.Firefox() as driver:
        try:
            driver.implicitly_wait(5)
            driver.maximize_window()
            driver.get(curr_url)
            driver.find_element_by_tag_name('body').send_keys(Keys.END)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'disqus_thread')))
            content = driver.page_source
            # for i in cont.find_all('iframe'):
            #     if i.has_attr('src') and i['src'].find('disqus.com/embed') >= 0:
            #         ds_url = i['src']
            #         print('found discus thread with url:', ds_url)
            #         break
        except:
            content = ''
            print('webdriver timeout... ')
        driver.close()
    # article
    for t in BeautifulSoup(content, "html.parser").find_all('article', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.select('div.tdb-block-inner > time.entry-date'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.select('ul.tdb-tags > li > a'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.select('h1.tdb-title-text'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.select('div.wpb_wrapper > div.tdb_single_content > div.tdb-block-inner'):
                for d in c.find_all('p', class_=None):
                    dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found disidentia.com...')
    counter = 0
    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')
        counter += 1
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='td-module-meta-info'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            break
        for c in t.find_all('h1', class_='entry-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
            break
        result = result + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='td-post-content tagdiv-type'):
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
    # comments
    if len(soup.find_all('ol', class_='comment-list')) > 0:
        print('Getting custom comments...')
        for t in soup.find_all('div', class_='comment-content'):
            counter += 1
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found espana2000.es...')
    # article
    for t in soup.find_all('body', class_='single-post'):
        # if t.has_attr('id') and t['id'].find('post') >= 0:
        print('Getting wordpress article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('div', class_='post-meta-wrapper'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='entry-title'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('div', class_='entry-content'):
            dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
    # comments
    for t in soup.find_all('div', class_='comments-wrapper'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found somatemps.me...')
    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        dm["meta"] = ''
        for c in t.find_all('p', class_='postmetadata'):
            dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
        dm["title"] = ''
        for c in t.find_all('h1', class_='posttitle'):
            dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
        dt["meta"] = dm
        dt["text"] = ''
        for c in t.find_all('section', class_='entry'):
            # fixed the misspelled keyword argument (was classs_)
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)
    # comments
    for t in soup.find_all('ol', class_='commentlist'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-wrapper'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found katohika.gr...')
    # article
    for t in soup.find_all('div', id='content'):
        if len(soup.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', class_='entry-author'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments-section'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found makeleio.gr...')
    # article
    for t in soup.find_all('div', class_='single-style1-wrap'):
        print('Getting wordpress article...')
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='single-style1-meta-tag'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
        for c in t.find_all('div', class_='single-style1-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
        result = result + '},'
        for c in t.find_all('div', class_='single-style1-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'
        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
    # comments
    for t in soup.find_all('div', class_='comments-area'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found sioeeu.wordpress.com...')
    # article
    for t in soup.select('article.type-post.format-standard'):
        if len(t.select('div.single-entry-content > p')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            for c in t.select('header.single-entry-header > p'):
                dm["meta"] = utils.clean_soup(c)
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = utils.clean_soup(c)
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.select('div.single-entry-content > p'):
                dt["text"] = dt["text"] + ' ' + utils.clean_soup(c)
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.select('div.comment-body > p'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = utils.clean_soup(c)
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found youtube...')
    # load and manipulate the website
    with webdriver.Firefox(options=FirefoxOptions()) as driver:
        driver.maximize_window()
        driver.implicitly_wait(5)
        ac = ActionChains(driver)
        # load the website
        try:
            driver.get(curr_url)
            time.sleep(5)
            # WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.ID, "info-text"))).click()
            # WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.DOWN)
            # for item in driver.find_elements_by_tag_name('body'):
            #     ac.move_to_element(item).move_by_offset(5, 5).click().perform()
            #     time.sleep(1)
            # scroll repeatedly so the lazy-loaded comments are rendered
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.DOWN)
                time.sleep(1)
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.END)
                time.sleep(5)
            content = driver.page_source
        except:
            print('webdriver timeout... ')
            content = ''
        # close the driver
        driver.close()
    # parse the comments
    for t in BeautifulSoup(content, "html.parser").find_all(
            'yt-formatted-string', id='content-text'):  # class_='post-message'
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'yt_comment'
        dm["source"] = curr_url
        dt["meta"] = dm
        dt["text"] = utils.clean_soup(t)
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found termometropolitico.it...')
    for t in soup.find_all('body', class_='single-post'):
        # if t.has_attr('id') and t['id'].find('post') >= 0:
        print('Getting wordpress article...')
        dt = {}
        dm = {}
        dm["id"] = str(hash)
        dm["type"] = 'article'
        dm["source"] = curr_url
        for c in t.find_all('div', class_='single_info'):
            dm["meta"] = utils.clean_soup(c)
        for c in t.find_all('h1', class_='single_title'):
            dm["title"] = utils.clean_soup(c)
        dt["meta"] = dm
        for c in t.find_all('div', class_='single_content'):
            dt["text"] = utils.clean_soup(c)
        result = json.dumps(dt, ensure_ascii=False)
        results.append(result)
        print(result)

def scrape(curr_url, hash, soup, results):
    print('Found imolaoggi.it...')
    # article
    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('h1', class_='entry-title')) > 0:
            print('Getting wordpress article...')
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'article'
            dm["source"] = curr_url
            dm["meta"] = ''
            for c in t.find_all('span', class_='post-author'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('span', class_='posted-on'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            for c in t.find_all('span', class_='cat-links'):
                dm["meta"] = dm["meta"] + utils.clean_soup(c) + ' '
            dm["title"] = ''
            for c in t.find_all('h1', class_='entry-title'):
                dm["title"] = dm["title"] + utils.clean_soup(c) + ' '
            dt["meta"] = dm
            dt["text"] = ''
            for c in t.find_all(class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(c) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)
    # comments
    for t in soup.find_all('ol', class_='commentlist'):
        print('Getting wordpress comments...')
        for c in t.find_all('li', class_='comment'):
            dt = {}
            dm = {}
            dm["id"] = str(hash)
            dm["type"] = 'comment'
            dm["source"] = curr_url
            dt["meta"] = dm
            dt["text"] = ''
            for d in c.find_all('p', class_=''):
                dt["text"] = dt["text"] + utils.clean_soup(d) + ' '
            result = json.dumps(dt, ensure_ascii=False)
            results.append(result)
            print(result)

def scrape(curr_url, hash, soup, results):
    print('Found a website...')
    # load and manipulate the website
    with webdriver.Firefox(options=FirefoxOptions()) as driver:
        driver.maximize_window()
        driver.implicitly_wait(5)
        # load the website
        try:
            driver.get(curr_url)
            for item in range(10):
                driver.find_element_by_tag_name('body').send_keys(Keys.DOWN)
                time.sleep(1)
            driver.find_element_by_tag_name('body').send_keys(Keys.END)
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            content = driver.page_source
        except:
            print('webdriver timeout... ')
            content = ''
        # close the driver
        driver.close()
    # parse the data
    dt = {}
    dm = {}
    dm["id"] = str(hash)
    dm["type"] = 'web_unstructured'
    dm["source"] = curr_url
    dt["meta"] = dm
    dt["text"] = utils.clean_soup(BeautifulSoup(content, "html.parser"))
    result = json.dumps(dt, ensure_ascii=False)
    results.append(result)
    print(result)

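
# The Firefox boilerplate above (scroll to trigger lazy loading, grab
# page_source, swallow timeouts) also appears in the olympia.gr and youtube
# scrapers. A sketch of a shared loader using the same selenium API the
# scrapers already use; the function name and defaults are illustrative:
def _load_page_source_sketch(url, scrolls=10, pause=1):
    with webdriver.Firefox(options=FirefoxOptions()) as driver:
        driver.maximize_window()
        driver.implicitly_wait(5)
        try:
            driver.get(url)
            body = driver.find_element_by_tag_name('body')
            for _ in range(scrolls):
                body.send_keys(Keys.DOWN)
                time.sleep(pause)
            body.send_keys(Keys.END)
            time.sleep(pause)
            return driver.page_source
        except Exception:
            print('webdriver timeout... ')
            return ''
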
def scrape(curr_url, hash, soup, results):
    print('Found hellenicns.gr...')
    # articles
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='below-entry-meta'):
                for d in c.select('time.published'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.author > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.tag-links'):
                    result = result + utils.clean_soup(d) + ' '
                break
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            # close the title value and the meta object; the original emitted
            # '",' here and never closed the meta object, producing unbalanced
            # braces in the output JSON
            result = result + '\"'
            result = result + '},'
            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'
            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
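
# Usage sketch: every scrape() in this section shares the signature
# scrape(curr_url, hash, soup, results) and appends JSON strings to results,
# so a caller can fetch a page once and dispatch on the URL. Each scrape()
# presumably lives in its own module; the fetching and hashing details below
# are assumptions, not shown in this section:
import hashlib

import requests


def scrape_dispatch_sketch(curr_url, results):
    html = requests.get(curr_url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    # a stable per-URL id; the repo's real hash scheme is not shown
    hash = hashlib.md5(curr_url.encode('utf-8')).hexdigest()
    if 'gaceta.es' in curr_url:
        scrape(curr_url, hash, soup, results)  # the gaceta.es scraper above
    # ...dispatch to the other per-site scrapers the same way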