def content_adder_thread(table, doc, i):
    link = doc['link']
    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)  # retry once
        if soup is None:
            print('request failed twice')
            return
    try:
        entry = soup.find('div', {'class': 'post-content entry-content cf'})
        children = list(entry.children)
        good_children = []
        good_tags = ['p', 'blockquote', 'ul']
        for child in children:
            if child.name in good_tags:
                # skip embedded tweets
                if 'twitter-tweet' in child.attrs.get('class', []):
                    continue
                good_children.append(child)
        content = '\n'.join(child.get_text() for child in good_children).replace('\xa0', ' ')
        _id = doc['_id']
        table.update_one(filter={'_id': _id}, update={'$set': {'content': content}})
        print('{} : {}'.format(i, content[:70]))
    except Exception:
        print('{}: article error'.format(i))
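# souper() is used throughout but never defined in this section. Judging by the
# call sites -- souper(link, False), souper(link, on_browser=True),
# souper(link, x_ind=...), souper(path, saved=True) -- it wraps requests /
# Selenium / local files and returns a BeautifulSoup object or None. The sketch
# below is an assumption about that helper, not the author's code.
import requests
from bs4 import BeautifulSoup


def souper(link, on_browser=False, x_ind=None, saved=False):
    if saved:
        # in 'saved' mode the first argument is a local file path
        with open(link) as f:
            return BeautifulSoup(f.read(), 'html.parser')
    if on_browser or x_ind:
        # the real helper apparently drives a browser and waits on an XPath;
        # that Selenium logic is omitted from this sketch
        raise NotImplementedError('browser mode not sketched here')
    try:
        resp = requests.get(link, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return None  # callers check for None and retry once
    return BeautifulSoup(resp.text, 'html.parser')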
def content_adder_thread(table, doc, i):
    # ipdb.set_trace()
    link = doc['link']
    if 'content' in doc.keys():
        return  # already scraped
    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)  # retry once
        if soup is None:
            print('request failed twice')
            return
    try:
        art_body = soup.find(
            'div',
            {'class': "TwoColumnLayout_container_385r0 Article_content_3kpRX TwoColumnLayout_fluid-left_3DYLH"})
        ps = art_body.find_all('p')
        content = '\n'.join(p.get_text() for p in ps)
        _id = doc['_id']
        table.update_one(filter={'_id': _id}, update={'$set': {'content': content}})
        print('{} : {}'.format(i, content[:70]))
    except Exception:
        print('{}: article error'.format(i))
def content_adder_thread(table, doc, i):
    link = doc['link']
    # if 'date' in doc.keys():
    #     return
    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)  # retry once
        if soup is None:
            print('request failed twice')
            return
    try:
        entry = soup.find('article', {'class': "entry-content"})
        children = list(entry.children)
        good_children = []
        good_tags = ['p', 'blockquote', 'ul']
        for child in children:
            if child.name in good_tags:
                # skip embedded tweets
                if 'twitter-tweet' in child.attrs.get('class', []):
                    continue
                good_children.append(child)
        content = '\n'.join(child.get_text() for child in good_children).replace('\xa0', ' ')
        # dateline text varies in length, so try a 13-char slice, then 12
        try:
            date_soup = soup.find('span', {'class': 'dateline'})
            date_text = date_soup.get_text()
            date = str(dt.strptime(date_text[:13], '%b. %d, %Y').date())
        except Exception:
            try:
                date_soup = soup.find('span', {'class': 'dateline'})
                date_text = date_soup.get_text()
                date = str(dt.strptime(date_text[:12], '%b. %d, %Y').date())
            except Exception:
                print('date_error')
                date = 'None'
        _id = doc['_id']
        table.update_one(filter={'_id': _id},
                         update={'$set': {'content': content, 'date': date}})
        print('{} : {}'.format(i, content[:70]))
    except Exception:
        print('{}: article error'.format(i))
def content_adder_thread(table, doc, i):
    link = doc['link']
    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)  # retry once
        if soup is None:
            print('request failed twice')
            return
    try:
        body = soup.find('div', {'id': "MainW"})
        # concatenate nested entry-content divs
        entry = body.find('div', {'class': 'entry-content'})

        # the first time I've ever actually found recursion useful
        def expand_nest(entry):
            children = list(entry.children)
            good_children = []
            good_tags = ['p', 'blockquote', 'ul']
            for child in children:
                if child.name in good_tags:
                    # skip embedded tweets
                    if 'twitter-tweet' in child.attrs.get('class', []):
                        continue
                    good_children.append(child)
                # recurse into nested entry-content divs and flatten the result
                if child.name == 'div' and child.attrs.get('class') == ['entry-content']:
                    good_children.extend(expand_nest(child))
            return good_children

        children = expand_nest(entry)
        content = '\n'.join(child.get_text() for child in children)
        content = content.replace('\xa0', ' ').replace("'", "’").replace('\u200a', ' ')
        _id = doc['_id']
        table.update_one(filter={'_id': _id}, update={'$set': {'content': content}})
        print('{} : {}'.format(i, content[:70]))
    except Exception:
        print('{}: article error'.format(i))
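# Hypothetical driver for the content_adder_thread variants above. The real
# harness is not shown in the source; ThreadPoolExecutor, the MongoDB
# connection details, and the missing-'content' filter are all assumptions.
from concurrent.futures import ThreadPoolExecutor

from pymongo import MongoClient


def run_content_adders(table, max_workers=8):
    # only fetch articles that still need their body text
    docs = list(table.find({'content': {'$exists': False}}))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, doc in enumerate(docs):
            pool.submit(content_adder_thread, table, doc, i)


if __name__ == '__main__':
    client = MongoClient()                      # assumed local MongoDB instance
    table = client['articles_db']['articles']   # hypothetical db / collection names
    run_content_adders(table)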
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'class': 'article-list'})
        soup_heads = art_list_soup.find_all('article')
    except Exception:
        print('page error')
        return
    for soup_head in soup_heads:
        try:
            doc = {}
            hl = soup_head.find('h2', {'class': 'title'})
            a = hl.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text().replace('\n', '').replace('\t', '').strip(' ')
            doc['source'] = 'bb'
            try:
                date_soup = soup_head.find('span', {'class': 'bydate'})
                datestring = date_soup.get_text()
                date = str(dateutil.parser.parse(datestring).date())
            except Exception:
                print('date_error')
                date = 'None'
            doc['date'] = date
            table.insert_one(doc)
        except Exception:
            print('card error')
            continue
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'class': 'small-12 medium-8 columns'})
        cards = art_list_soup.find_all('article')
    except Exception:
        print('page error')
        return
    for card in cards:
        try:
            doc = {}
            h = card.find('h3', {'itemprop': 'headline'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a['title']
            doc['source'] = 'od'
            try:
                datestring = card.find('time', {'class': 'time'})['datetime']
                date = str(dateutil.parser.parse(datestring).date())
            except Exception:
                print('date_error')
                date = 'None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except Exception:
            print('card error')
            continue
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'class': 'page-content-inner clearfix'})
        cards = art_list_soup.find_all('div', {'class': 'post-text'})
    except Exception:
        print('page error')
        return
    for card in cards:
        try:
            doc = {}
            h = card.find('h3', {'class': 'post-title'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text()
            doc['source'] = 'gp'
            try:
                datestring = card.find('span', {'class': 'post-date'}).get_text()
                date = str(dateutil.parser.parse(datestring).date())
            except Exception:
                print('date_error')
                date = 'None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except Exception:
            print('card error')
            continue
def meta_scraper_thread(table, link):
    # ipdb.set_trace()
    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)  # retry once
        if soup is None:
            print('request failed twice')
            return
    try:
        body_soup = soup.find('div', {'class': 'news-headline-list'})
        arts = body_soup.find_all('article')
    except Exception:
        print('carding error')
        return
    for art in arts:
        try:
            card = art.find('div', {'class': 'story-content'})
            a = card.find('a')
            doc = {}
            doc_link = 'https://www.reuters.com' + a['href']
            doc['link'] = doc_link
            title = a.get_text().replace('\t', '').replace('\n', '')
            doc['title'] = title
            date_str = card.find('span', {'class': 'timestamp'}).get_text()
            date = dt.strptime(date_str, '%b %d %Y')
            # date = dt.date(date)
            doc['date'] = date
            table.insert_one(doc)
            print(dt.date(date), title)
        except Exception:
            print('article error')
def transcript_adder(table):
    gen = table_grabber(table)
    for i, doc in enumerate(gen):
        if 'content' not in doc.keys():
            link = doc['link']
            _id = doc['_id']
            soup = souper(link, on_browser=True)
            content = soup.find('p').text
            table.update_one(filter={'_id': _id}, update={'$set': {'content': content}})
            print(i, content[:70])
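# table_grabber() is called above but not defined in this section; a minimal
# guess, assuming it simply streams documents out of the Mongo collection:
def table_grabber(table):
    for doc in table.find():
        yield doc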
def super_scrape_link(table, link):
    xpath = '/html/body/div[5]/div[2]/div/div[2]/div[2]/div/div[3]'
    try:
        soup = souper(link, x_ind=xpath)
        result_soup = soup.find('div', {'class': 'cnn-search__results-list'})
        cards = [card for card in result_soup.children if card != '\n']
    except Exception:
        print('page error')
        return
    for card in cards:
        insert_card(card, table)
    print('Inserted link:{}'.format(link))
def get_content(table, doc, i):
    # the flattened original took only `link` but used table, _id, and i; it is
    # assumed here to share the (table, doc, i) signature of the other
    # content-adder threads above
    link = doc['link']
    _id = doc['_id']
    try:
        soup = souper(link, on_browser=False)
        art_body = soup.find('div', {'class': 'article-body'})
        ps = art_body.find_all('p')
        text_lines = []
        for p in ps:
            text_lines.append(p.get_text())
        content = '\n'.join(text_lines)
        table.update_one(filter={'_id': _id}, update={'$set': {'content': content}})
        print(i, content[:70])
    except Exception:
        return  # `continue` in the original is invalid outside a loop
def meta_scraper(table):
    # link = 'https://newrepublic.com/political-ad-database'
    soup = souper('../../data/complete.html', saved=True)
    ad_items = soup.find_all('div', {'class': "campaign-ads-ad"})
    for item in ad_items:
        meta = {}
        meta['link'] = item.find('a', {'class': "campaign-ad-link"})['href']
        meta['title'] = item.find('h3').text
        meta['supports'] = item.find('h4').text
        meta['date'] = item.find('time')['datetime']
        try:
            table.insert_one(meta)
        except DuplicateKeyError:
            print('DuplicateKeyError, object already in database:')
            print("-", meta['title'])
def hp_content_collector(table, gen):
    while True:
        try:
            doc = next(gen)
        except StopIteration:
            return
        except ValueError:
            time.sleep(0.1)
            continue  # don't fall through with a stale or unbound doc
        if 'content' in doc.keys():
            continue
        link = doc['link']
        _id = doc['_id']
        soup = souper(
            link,
            x_ind='//*[@id="us_5a0cb765e4b0c0b2f2f78878"]/div/div[1]/div[4]/div[1]')
        if soup is None:
            continue
        try:
            date_text = soup.find('span', {'class': 'timestamp__date--published'}).text[:10]
            date = datetime.strptime(date_text, '%m/%d/%Y')
            art_body = soup.find('div', {'class': 'entry__text js-entry-text bn-entry-text'})
            ps = art_body.find_all('p')
            text_lines = []
            for p in ps:
                text_lines.append(p.get_text())
            content = '\n'.join(text_lines)
            table.update_one(filter={'_id': _id},
                             update={'$set': {'content': content, 'date': date}})
            print(content[:70])
        except Exception:
            print('content error')
            continue
def get_links_from_page(page_link, table, link_set):
    soup = souper(
        page_link,
        x_ind='//*[@id="zone_twilight_upper"]/div/div[1]/div/div/div[2]/h2/a')
    try:
        body = soup.find('div', {'class': 'zone__content bn-zone',
                                 'data-zone': 'twilight_upper'})
        cards = soup.find_all(
            'div',
            {'class': 'bn-card card card--autopilot card--media-left card--twilight'})
        for card in cards:
            try:
                section = card.find('h3').text
                if section != 'POLITICS':
                    continue
                headline = card.find('a', {'class': 'card__link bn-card-headline bn-clickable'})
                title = headline.get_text()
                print(title)
                link = 'https://www.huffingtonpost.com/' + headline['href']
                if link in link_set:
                    continue
                try:
                    author = card.find('span', {'class': 'bn-clickable'}).text
                except Exception:
                    author = None
                doc = {'link': link, 'title': title, 'author': author}
                try:
                    table.insert_one(doc)
                    link_set.add(link)
                    print(title)
                except DuplicateKeyError:
                    print('DuplicateKeyError')
            except Exception:
                print('oh well')
    except Exception:
        print('bummer that page didnt load!')
def _content_scraper(self, doc):
    link = doc['web_url']
    try:
        soup = st.souper(link, False)
    except Exception:
        print('souping error')
        return
    try:
        art_body = soup.find('article', {'id': 'story'})
        art_parts = art_body.find_all('div', {'class': 'story-body-supplemental'})
    except Exception:
        print('parting error')
        return
    try:
        children = []
        for part in art_parts:
            children.extend(part.children)
        content_parts = []
        for child in children:
            if type(child) == bs4.element.Tag:
                content_parts.extend([i.text for i in child.select('p')])
        article_content = '\n'.join(content_parts)
        if not article_content:
            return
    except Exception:
        print('text collection error')
        return
    doc['content'] = article_content
    self.table.insert_one(doc)
    print(self.i, ':', article_content[:80])
    time.sleep(np.random.random() / 2 + 0.3)
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('ul', {'class': 'articles-list'})
        soup_heads = art_list_soup.find_all('h3', {'class': 'hed'})
    except Exception:
        print('page error')
        return
    for soup_head in soup_heads:
        try:
            doc = {}
            a = soup_head.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text().replace('\n', '').replace('\t', '').strip(' ')
            doc['source'] = 'mj'
            table.insert_one(doc)
        except Exception:
            print('card error')
            continue
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'id': 'recent-posts', 'class': 'clearfix'})
        cards = list(art_list_soup.children)[:-1]
    except Exception:
        print('page error')
        return
    for card in cards:
        try:
            doc = {}
            h = card.find('h2', {'class': 'entry-title'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a['title']
            if 'video' in doc['title'].lower():
                continue
            doc['source'] = 'ai'
            try:
                datestring = card.find('time', {'class': 'entry-date published updated'})['datetime']
                date = str(dateutil.parser.parse(datestring).date())
            except Exception:
                print('date_error')
                date = 'None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except Exception:
            print('card error')
            continue
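# Minimal sketch of how the meta_scrape_link variants might be driven over a
# paginated listing. The actual listing URL pattern and page count are not in
# the source, so BASE_URL and the '?page=N' query string are placeholders.
import time

BASE_URL = 'https://example.com/politics'  # hypothetical listing URL


def scrape_listing_pages(table, n_pages=10, delay=1.0):
    for page in range(1, n_pages + 1):
        link = '{}?page={}'.format(BASE_URL, page)
        meta_scrape_link(table, link)
        time.sleep(delay)  # be polite between listing requests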