Example #1
def content_adder_thread(table, doc, i):
    link = doc['link']

    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)
        if soup is None:
            print('request failed twice')
            return
    try:
        entry = soup.find('div', {'class': 'post-content entry-content cf'})
        children = list(entry.children)
        good_children = []
        good_tags = ['p', 'blockquote', 'ul']

        for child in children:
            if child.name in good_tags:
                if 'twitter-tweet' in child.attrs.get('class',[]):
                    continue
                good_children.append(child)

        content = '\n'.join([child.get_text() for child in good_children]).replace('\xa0',' ')

        _id = doc['_id']
        table.update_one(filter={'_id':_id}, update={'$set':{'content':content}})
        print('{} : {}'.format(i, content[:70]))
    except:
        print('{}: article error'.format(i))
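
All of these examples call a souper helper that is not shown. From the call sites (souper(link, False), souper(link, on_browser=True), souper(link, x_ind=...), souper(path, saved=True)) it appears to fetch a page and return a BeautifulSoup tree, optionally through a Selenium-driven browser or from a saved file. The sketch below is only a guess at that helper under those assumptions, not the original implementation:

import requests
from bs4 import BeautifulSoup


def souper(link, on_browser=False, x_ind=None, saved=False):
    # rough sketch of the helper assumed by these examples, not the original
    if saved:
        # 'link' is a path to an HTML file already saved on disk
        with open(link) as f:
            return BeautifulSoup(f.read(), 'html.parser')

    if on_browser or x_ind is not None:
        # Selenium path: render the page in a real browser and, when an
        # XPath is given, wait for that element before grabbing the source
        from selenium import webdriver
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        driver = webdriver.Firefox()
        try:
            driver.get(link)
            if x_ind is not None:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, x_ind)))
            html = driver.page_source
        finally:
            driver.quit()
        return BeautifulSoup(html, 'html.parser')

    # plain requests path; return None on failure so callers can retry
    try:
        resp = requests.get(link, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(resp.text, 'html.parser')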
Example #2
def content_adder_thread(table, doc, i):
    #ipdb.set_trace()

    link = doc['link']
    if 'content' in doc.keys():
        return

    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)
        if soup is None:
            print('request failed twice')
            return

    try:
        art_body = soup.find(
            'div', {
                'class':
                "TwoColumnLayout_container_385r0 Article_content_3kpRX TwoColumnLayout_fluid-left_3DYLH"
            })
        ps = art_body.find_all('p')
        content = '\n'.join([p.get_text() for p in ps])
        _id = doc['_id']
        table.update_one(filter={'_id': _id},
                         update={'$set': {
                             'content': content
                         }})
        print('{} : {}'.format(i, content[:70]))
    except:
        print('{}: article error'.format(i))
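
The _thread suffix suggests each of these functions handles one document and is fanned out over worker threads. A possible driver, assuming a pymongo collection whose documents already carry a link field, might look like this (a sketch, not the original code):

# Hypothetical driver (not in the original code): fan content_adder_thread
# out over a pool of worker threads, one document per task.
from concurrent.futures import ThreadPoolExecutor


def add_content_to_table(table, max_workers=8):
    # only documents that still lack a 'content' field need scraping
    docs = list(table.find({'content': {'$exists': False}}))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, doc in enumerate(docs):
            pool.submit(content_adder_thread, table, doc, i)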
Example #3
def content_adder_thread(table, doc, i):
    link = doc['link']
    # if 'date' in doc.keys():
    #     return

    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)
        if soup is None:
            print('request failed twice')
            return
    try:
        entry = soup.find('article', {'class': "entry-content"})
        children = list(entry.children)
        good_children = []
        good_tags = ['p', 'blockquote', 'ul']

        for child in children:
            if child.name in good_tags:
                if 'twitter-tweet' in child.attrs.get('class', []):
                    continue
                good_children.append(child)

        content = '\n'.join([child.get_text()
                             for child in good_children]).replace('\xa0', ' ')

        try:
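            # the day in the dateline may be one or two digits, so try a
            # 13-character slice first and fall back to 12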
            date_soup = soup.find('span', {'class': 'dateline'})
            date_text = date_soup.get_text()
            date = str(dt.strptime(date_text[:13], '%b. %d, %Y').date())
        except:
            try:
                date_soup = soup.find('span', {'class': 'dateline'})
                date_text = date_soup.get_text()
                date = str(dt.strptime(date_text[:12], '%b. %d, %Y').date())
            except:
                print('date_error')
                date = 'None'

        _id = doc['_id']
        table.update_one(filter={'_id': _id},
                         update={'$set': {
                             'content': content,
                             'date': date
                         }})
        print('{} : {}'.format(i, content[:70]))
    except:
        print('{}: article error'.format(i))
Example #4
def content_adder_thread(table, doc, i):
    link = doc['link']

    soup = souper(link, False)
    if soup is None:
        print('request error')
        time.sleep(1)
        soup = souper(link, False)
        if soup is None:
            print('request failed twice')
            return
    try:
        body = soup.find('div', {'id': "MainW"})

        #concat nested entry-contents
        entry = body.find('div', {'class': 'entry-content'})

        # the first time I've ever actually found recursion useful
        def expand_nest(entry):
            children = list(entry.children)
            good_children = []
            good_tags = ['p', 'blockquote', 'ul']

            for child in children:
                if child.name in good_tags:
                    if ['twitter-tweet'] in child.attrs.values():
                        continue
                    good_children.append(child)
                # class attrs parse as lists, and nested results must be flattened
                if child.name == 'div' and child.attrs.get('class') == [
                        'entry-content'
                ]:
                    good_children.extend(expand_nest(child))
            return good_children

        children = expand_nest(entry)

        content = '\n'.join([child.get_text() for child in children])
        content = content.replace('\xa0',
                                  ' ').replace("'",
                                               "’").replace('\u200a', ' ')

        _id = doc['_id']
        table.update_one(filter={'_id': _id},
                         update={'$set': {
                             'content': content
                         }})
        print('{} : {}'.format(i, content[:70]))
    except:
        print('{}: article error'.format(i))
Example #5
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'class': 'article-list'})
        soup_heads = art_list_soup.find_all('article')
    except:
        print('page error')
        return

    for soup_head in soup_heads:
        try:
            doc = {}
            hl = soup_head.find('h2', {'class': 'title'})
            a = hl.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text().replace('\n',
                                                '').replace('\t',
                                                            '').strip(' ')
            doc['source'] = 'bb'
            try:
                date_soup = soup_head.find('span', {'class': 'bydate'})
                datestring = date_soup.get_text()
                date = str(dateutil.parser.parse(datestring).date())
            except:
                print('date_error')
                date = 'None'
            doc['date'] = date
            table.insert_one(doc)

        except:
            print('card error')
            continue
Example #6
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {'class':'small-12 medium-8 columns'})
        cards = art_list_soup.find_all('article')
    except:
        print('page error')
        return

    for card in cards:
        try:
            doc = {}
            h = card.find('h3', {'itemprop':'headline'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a['title']
            doc['source'] = 'od'
            try:
                datestring = card.find('time',{'class':'time'})['datetime']
                date = str(dateutil.parser.parse(datestring).date())
            except:
                print('date_error')
                date='None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except:
            print('card error')
            continue
Example #7
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div',
                                  {'class': 'page-content-inner clearfix'})
        cards = art_list_soup.find_all('div', {'class': 'post-text'})
    except:
        print('page error')
        return

    for card in cards:
        try:
            doc = {}
            h = card.find('h3', {'class': 'post-title'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text()
            doc['source'] = 'gp'
            try:
                datestring = card.find('span', {
                    'class': 'post-date'
                }).get_text()
                date = str(dateutil.parser.parse(datestring).date())
            except:
                print('date_error')
                date = 'None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except:
            print('card error')
            continue
Example #8
def meta_scraper_thread(table, link):
    # ipdb.set_trace()

    soup = souper(link, False)
    if soup is None:
        print('request error')

        time.sleep(1)
        soup = souper(link, False)
        if soup is None:
            print('request failed twice')
            return

    try:
        body_soup = soup.find('div', {'class': 'news-headline-list'})
        arts = body_soup.find_all('article')
    except:
        print('carding error')
        return

    for art in arts:
        try:
            card = art.find('div', {'class': 'story-content'})
            a = card.find('a')
            doc = {}

            doc_link = 'https://www.reuters.com' + a['href']
            doc['link'] = doc_link

            title = a.get_text().replace('\t', '').replace('\n', '')
            doc['title'] = title

            date_str = card.find('span', {'class': 'timestamp'}).get_text()
            date = dt.strptime(date_str, '%b %d %Y')
            #date = dt.date(date)
            doc['date'] = date

            table.insert_one(doc)
            print(dt.date(date), title)
        except:
            print('article error')
Example #9
def transcript_adder(table):
    gen = table_grabber(table)

    for i, doc in enumerate(gen):
        if 'content' not in doc:
            link = doc['link']
            _id = doc['_id']
            soup = souper(link, on_browser=True)
            content = soup.find('p').text
            table.update_one(filter={'_id': _id},
                             update={'$set': {
                                 'content': content
                             }})
            print(i, content[:70])
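
table_grabber is not shown either; given how it is consumed here, it presumably just yields every document in the collection. A minimal sketch under that assumption:

# Minimal guess at the table_grabber generator used above (not the original):
# it simply yields every document in the pymongo collection.
def table_grabber(table):
    for doc in table.find():
        yield doc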
Example #10
def super_scrape_link(table, link):

    xpath = '/html/body/div[5]/div[2]/div/div[2]/div[2]/div/div[3]'

    try:
        soup = souper(link, x_ind=xpath)
        result_soup = soup.find('div', {'class': 'cnn-search__results-list'})
        cards = [card for card in result_soup.children if card != '\n']
    except:
        print('page error')
        return

    for card in cards:
        insert_card(card, table)
    print('Inserted link:{}'.format(link))
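
insert_card is not defined in this example; a hypothetical version, with the selectors guessed rather than taken from the actual CNN search-result markup, might be:

# Hypothetical insert_card helper; the original is not shown, so the selectors
# below are guesses rather than the actual CNN search-result markup.
def insert_card(card, table):
    try:
        a = card.find('a')
        doc = {
            'link': a['href'],
            'title': a.get_text().strip(),
            'source': 'cnn',
        }
        table.insert_one(doc)
    except Exception:
        print('card error')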
Example #11
def get_content(link):
    try:
        soup = souper(link, on_browser=False)
        art_body = soup.find('div', {'class': 'article-body'})
        ps = art_body.find_all('p')
        text_lines = []
        for p in ps:
            text_lines.append(p.get_text())
        content = '\n'.join(text_lines)
        # table, _id, and i are assumed to come from the enclosing scope
        table.update_one(filter={'_id': _id},
                         update={'$set': {
                             'content': content
                         }})
        print(i, content[:70])
    except:
        return
Example #12
def meta_scraper(table):
    # link = 'https://newrepublic.com/political-ad-database'
    soup = souper('../../data/complete.html', saved=True)
    ad_items = soup.find_all('div', {'class': "campaign-ads-ad"})

    for item in ad_items:
        meta = {}
        meta['link'] = item.find('a', {'class': "campaign-ad-link"})['href']
        meta['title'] = item.find('h3').text
        meta['supports'] = item.find('h4').text
        meta['date'] = item.find('time')['datetime']
        try:
            table.insert_one(meta)
        except DuplicateKeyError:
            print('DuplicateKeyError, object already in database:')
            print("-", meta['title'])
Example #13
def hp_content_collector(table, gen):

    while True:
        try:
            doc = next(gen)
        except StopIteration:
            return
        except ValueError:
            time.sleep(0.1)
            continue

        if 'content' in doc.keys():
            continue

        link = doc['link']
        _id = doc['_id']
        soup = souper(
            link,
            x_ind=
            '//*[@id="us_5a0cb765e4b0c0b2f2f78878"]/div/div[1]/div[4]/div[1]')
        if soup is None:
            continue

        try:
            date_text = soup.find('span', {
                'class': 'timestamp__date--published'
            }).text[:10]
            date = datetime.strptime(date_text, '%m/%d/%Y')

            art_body = soup.find(
                'div', {'class': 'entry__text js-entry-text bn-entry-text'})
            ps = art_body.find_all('p')
            text_lines = []
            for p in ps:
                text_lines.append(p.get_text())
            content = '\n'.join(text_lines)

            table.update_one(
                filter={'_id': _id},
                update={'$set': {
                    'content': content,
                    'date': date
                }})
            print(content[:70])
        except:
            print('content error')

        continue
Example #14
def get_links_from_page(page_link, table, link_set):
    soup = souper(
        page_link,
        x_ind='//*[@id="zone_twilight_upper"]/div/div[1]/div/div/div[2]/h2/a')

    try:
        body = soup.find('div', {
            'class': 'zone__content bn-zone',
            'data-zone': 'twilight_upper'
        })
        cards = soup.find_all(
            'div', {
                'class':
                'bn-card card card--autopilot card--media-left card--twilight'
            })
        for card in cards:
            try:
                section = card.find('h3').text
                if section != 'POLITICS':
                    continue
                headline = card.find(
                    'a', {'class': 'card__link bn-card-headline bn-clickable'})
                title = headline.get_text()
                print(title)
                link = 'https://www.huffingtonpost.com/' + headline['href']

                if link in link_set:
                    continue

                try:
                    author = card.find('span', {'class': 'bn-clickable'}).text
                except:
                    author = None

                doc = {'link': link, 'title': title, 'author': author}
                try:
                    table.insert_one(doc)
                    link_set.add(link)
                    print(title)
                except DuplicateKeyError:
                    print('DuplicateKeyError')
            except:
                print('oh well')
    except:
        print('bummer that page didnt load!')
Example #15
    def _content_scraper(self, doc):
        link = doc['web_url']
        try:
            soup = st.souper(link, False)
        except:
            print('souping error')
            return

        try:
            art_body = soup.find('article', {'id': 'story'})
            art_parts = art_body.find_all('div',
                                          {'class': 'story-body-supplemental'})
        except:
            print('parting error')
            return

        try:
            children = []
            for part in art_parts:
                children.extend(part.children)

            content_parts = []
            for child in children:
                if isinstance(child, bs4.element.Tag):
                    content_parts.extend([i.text for i in child.select('p')])

            article_content = '\n'.join(content_parts)

            if not article_content:
                return
        except:
            print('text collection error')
            return

        doc['content'] = article_content
        self.table.insert_one(doc)
        print(self.i, ':', article_content[:80])
        time.sleep(np.random.random() / 2 + 0.3)
Example #16
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('ul', {'class': 'articles-list'})
        soup_heads = art_list_soup.find_all('h3', {'class': 'hed'})
    except:
        print('page error')
        return

    for soup_head in soup_heads:
        try:
            doc = {}
            a = soup_head.find('a')
            doc['link'] = a['href']
            doc['title'] = a.get_text().replace('\n',
                                                '').replace('\t',
                                                            '').strip(' ')
            doc['source'] = 'mj'

            table.insert_one(doc)
        except:
            print('card error')
            continue
Example #17
def meta_scrape_link(table, link):
    try:
        soup = souper(link, False)
        art_list_soup = soup.find('div', {
            'id': 'recent-posts',
            'class': 'clearfix'
        })
        cards = list(art_list_soup.children)[:-1]
    except:
        print('page error')
        return

    for card in cards:
        try:
            doc = {}
            h = card.find('h2', {'class': 'entry-title'})
            a = h.find('a')
            doc['link'] = a['href']
            doc['title'] = a['title']
            if 'video' in doc['title'].lower():
                continue
            doc['source'] = 'ai'
            try:
                datestring = card.find(
                    'time',
                    {'class': 'entry-date published updated'})['datetime']
                date = str(dateutil.parser.parse(datestring).date())
            except:
                print('date_error')
                date = 'None'
            doc['date'] = date
            print(doc['title'])
            table.insert_one(doc)
        except:
            print('card error')
            continue