Beispiel #1
0
def _copydoc():
    """
    Render the transcript HTML through the ``copydoc.html`` template.

    Reads the file at ``app_config.TRANSCRIPT_HTML_PATH``, wraps its contents
    in a ``CopyDoc`` and hands it to the template under the name ``doc``.
    """
    with open(app_config.TRANSCRIPT_HTML_PATH) as transcript:
        markup = transcript.read()

    rendered = render_template('copydoc.html', doc=CopyDoc(markup))
    return make_response(rendered)
Beispiel #2
0
def _copydoc(filename):
    """
    Render an episode document through the ``copydoc.html`` template.

    The document key is the part of ``filename`` before the first dot.
    Aborts with 404 when ``data/<filename>`` does not exist or when the key
    has no entry in ``app_config.EPISODE_DOCUMENTS``.
    """
    key = filename.split('.')[0]
    if not os.path.exists('data/%s' % filename):
        abort(404)

    # A file can exist under data/ without being registered in the config;
    # answer 404 in that case instead of letting the KeyError escape.
    try:
        document = app_config.EPISODE_DOCUMENTS[key]
    except KeyError:
        abort(404)

    with open(document['path']) as f:
        html = f.read()

    context = {
        'doc': CopyDoc(html)
    }

    return make_response(render_template('copydoc.html', **context))
Beispiel #3
0
class CopyDocSpaces(unittest.TestCase):
    """
    Checks on CopyDoc's output for the transcript_with_embed.html fixture.
    """
    def setUp(self):
        # Parse the fixture for each test and keep a handle on its <body>.
        with open('tests/transcript_with_embed.html') as fixture:
            markup = fixture.read()

        self.parser = CopyDoc(markup, TOKENS)
        self.body = self.parser.soup.body

    def test_num_lines(self):
        children = self.body.contents
        self.assertEqual(len(children), 4)

    def test_iframe_markup(self):
        expected_iframe = '<iframe width="560" height="315" src="https://www.youtube.com/embed/dZTKOBElkyg" frameborder="0" allowfullscreen></iframe>'
        rendered = self.parser.__unicode__()
        self.assertTrue(expected_iframe in rendered)
Beispiel #4
0
def _episode(filename):
    """
    Render the ``episode.html`` template for an episode document.

    The document key is the part of ``filename`` before the first dot.
    Aborts with 404 when ``data/<filename>`` does not exist or when the key
    has no entry in ``app_config.EPISODE_DOCUMENTS``. The template context is
    ``make_context()`` updated with the parsed document, plus ``episode``
    (the key) and ``next`` (taken from the document's config entry).
    """
    key = filename.split('.')[0]
    if not os.path.exists('data/%s' % filename):
        abort(404)

    # A file can exist under data/ without being registered in the config;
    # answer 404 in that case instead of letting the KeyError escape.
    try:
        document = app_config.EPISODE_DOCUMENTS[key]
    except KeyError:
        abort(404)

    with open(document['path']) as f:
        html = f.read()

    context = make_context()
    doc = CopyDoc(html)
    parsed_document = parse_doc.parse(doc)
    context.update(parsed_document)
    context.update({
        'episode': key,
        'next': document['next']})
    return make_response(render_template('episode.html', **context))
Beispiel #5
0
def parse_document(html):
    """
    Wrap ``html`` in a ``CopyDoc`` and return what ``parse_doc.parse`` yields
    for it.
    """
    return parse_doc.parse(CopyDoc(html))
Beispiel #6
0
def lambda_handler(event, context):
    """
    Generate the factcheck preview from the Google documents named in `event`.

    - retrieves the `doc_key` and `authors_key` drive keys from the payload
    - connects to google using authomatic and OAuth2 credentials
    - parses the factcheck document and publishes to staging

    Returns a dict with a success `message`.

    Raises `app_config.UserException` (logged, then re-raised) when a key is
    missing from `event`, when a Google request does not answer HTTP 200, or
    when `transform_authors` returns nothing for the authors spreadsheet.
    """
    try:
        try:
            logger.info('Start preview generation')
            transcript_gdoc_key = event['doc_key']
            authors_google_doc_key = event['authors_key']
        except KeyError:
            msg = 'Did not find needed params in %s' % (event)
            raise app_config.UserException('[BadRequest]: %s' % msg)
        authors_url = app_config.SPREADSHEET_URL_TEMPLATE % (
            authors_google_doc_key)
        doc_url = app_config.DOC_URL_TEMPLATE % (transcript_gdoc_key)

        # Get the credentials and refresh them if necessary
        credentials = app_config.authomatic.credentials(
            app_config.GOOGLE_CREDS)
        if not credentials.valid:
            credentials.refresh()

        # Get authors
        response = app_config.authomatic.access(credentials, authors_url)
        if response.status != 200:
            msg = 'While accessing %s got HTTP: %s' % (authors_url,
                                                       response.status)
            raise app_config.UserException('[BadRequest]: %s' % msg)
        authors_data = response.content
        authors = transform_authors(authors_data)
        if not authors:
            msg = 'Could not parse authors spreadsheet %s' % (authors_url)
            raise app_config.UserException('[BadRequest]: %s' % msg)

        # Get document
        response = app_config.authomatic.access(credentials, doc_url)
        if response.status != 200:
            msg = 'While accessing %s got HTTP: %s' % (doc_url,
                                                       response.status)
            raise app_config.UserException('[BadRequest]: %s' % msg)
        html = response.content

        # Parse data. The template context gets its own name so the
        # `context` argument passed to the handler is not rebound.
        doc = CopyDoc(html)
        logger.info('Parsed doc html with copydoc')
        template_context = parse_doc.parse(doc, authors)
        logger.info('Parsed factcheck')

        # Generate final files and upload to S3
        upload_template_contents(template_context, 'factcheck.html')
        upload_template_contents(template_context, 'share.html')
        # The preview variant is rendered from the same template with the
        # `preview` flag set, and uploaded under a different name.
        template_context['preview'] = True
        upload_template_contents(template_context, 'factcheck.html',
                                 'factcheck_preview.html')
        logger.info('Generated factcheck templates. Execution successful')
        return {'message': 'Preview generated successfully'}
    except app_config.UserException as e:
        # `as` is accepted by Python 2.6+ and Python 3; the older
        # `except X, e` form is a syntax error on Python 3.
        logger.error('Exit with controlled exception %s', e)
        raise
Beispiel #7
0
    def setUp(self):
        """Parse tests/testdoc.html with CopyDoc and keep the body's children."""
        with open('tests/testdoc.html') as fixture:
            markup = fixture.read()

        self.parser = CopyDoc(markup, TOKENS)
        body = self.parser.soup.body
        self.contents = body.contents
Beispiel #8
0
class CopyDocTestCase(unittest.TestCase):
    """
    Tests for CopyDoc's parsing of the tests/testdoc.html fixture.

    Each test inspects either the children of the parsed <body>
    (``self.contents``) or an attribute / the unicode rendering of the
    parser itself (``self.parser``).
    """
    def setUp(self):
        with open('tests/testdoc.html') as f:
            html_string = f.read()

        self.parser = CopyDoc(html_string, TOKENS)
        self.contents = self.parser.soup.body.contents

    def test_num_lines(self):
        self.assertEqual(len(self.contents), 19)

    def test_h1(self):
        self._is_tag(self.contents[0], 'h1')

    def test_h1_has_no_children(self):
        child_length = len(self.contents[0].find_all())
        self.assertEqual(child_length, 0)

    def test_h2(self):
        self._is_tag(self.contents[1], 'h2')

    def test_h3(self):
        self._is_tag(self.contents[2], 'h3')

    def test_p(self):
        self._is_tag(self.contents[3], 'p')

    def test_strong(self):
        self._contains_tag(self.contents[4], 'strong')

    def test_em(self):
        self._contains_tag(self.contents[5], 'em')

    def test_u(self):
        self._contains_tag(self.contents[6], 'u')

    def test_ignore_html(self):
        self._contains_tag(self.contents[7], 'strong', 0)

    def test_a(self):
        self._contains_tag(self.contents[8], 'a')

    def test_a_count(self):
        tags = self.parser.soup.body.findAll('a')
        self.assertEqual(len(tags), 2)

    def test_ahref(self):
        href = self.contents[8].a.attrs['href'][0]
        self.assertEqual(href, 'http://npr.org')

    def test_ul(self):
        self._is_tag(self.contents[9], 'ul')

    def test_ul_li(self):
        count_li = len(self.contents[9].find_all('li'))
        self.assertEqual(count_li, 3)

    def test_ol(self):
        self._is_tag(self.contents[10], 'ol')

    def test_ol_li(self):
        count_li = len(self.contents[10].find_all('li'))
        self.assertEqual(count_li, 3)

    def test_img(self):
        self._contains_tag(self.contents[11], 'img')

    def test_strange_has_no_children(self):
        child_length = len(self.contents[12].find_all())
        self.assertEqual(child_length, 0)

    def test_strange_has_extra_space_bug(self):
        clean_string = self.parser.clean_linebreaks(self.contents[12])
        expected_string = '<p>Strange formatting</p>'
        self.assertEqual(clean_string, expected_string)

    def test_tabletag(self):
        self._is_tag(self.contents[13], 'table')

    def test_tabletd(self):
        self._contains_tag(self.contents[13], 'td', 4)

    def test_tabletr(self):
        self._contains_tag(self.contents[13], 'tr', 2)

    def test_anchortag_combination(self):
        self._contains_tag(self.contents[15], 'a')

    def test_headline_extraction(self):
        self.assertEqual(self.parser.headline, 'this is a headline')

    def test_subhed_extraction(self):
        self.assertEqual(self.parser.subhed, 'this is a subhed')

    def test_banner_extraction(self):
        self.assertEqual(self.parser.banner, 'this is a banner')

    def test_image_extraction(self):
        self.assertEqual(self.parser.image, 'http://media.npr.org/assets/img/2015/12/29/gettyimages-477258926_wide-s700-c85.jpg')

    def test_mobile_image_extraction(self):
        self.assertEqual(self.parser.mobile_image, 'https://media.giphy.com/media/3oEdv5FXteGY8iS8CY/giphy.gif')

    def test_audio_url_extraction(self):
        self.assertEqual(self.parser.audio_url, 'http://play.podtrac.com/npr-510310/npr.mc.tritondigital.com/NPR_510310/media/anon.npr-mp3/npr/nprpolitics/2016/02/20160205_nprpolitics_roundup.mp3?orgId=1&d=2261&p=510310&story=465741966&t=podcast&e=465741966&ft=pod&f=510310')

    def test_credit_extraction(self):
        self.assertEqual(self.parser.credit, 'this is a photo credit')

    def test_mobile_credit_extraction(self):
        self.assertEqual(self.parser.mobile_credit, 'this is a mobile photo credit')

    def test_iframe_markup(self):
        self.assertTrue('<iframe width="560" height="315" src="https://www.youtube.com/embed/659pppwniXA" frameborder="0" allowfullscreen></iframe>' in self.parser.__unicode__())

    def test_nbsp_markup(self):
        self.assertTrue('This is a paragraph with a non-breaking&nbsp;space.' in self.parser.__unicode__())

    def spaces_stripped(self):
        # NOTE(review): unittest only collects methods whose names start with
        # "test", so this check does not run with the suite as named --
        # confirm whether that is intentional before renaming it.
        clean_string = self.parser.clean_linebreaks(self.contents[17])
        expected_string = '<p>This is a paragraph with multiple spaces.</p>'
        # Compare the cleaned markup with the expectation; the previous
        # assertion referenced an undefined name and raised NameError.
        self.assertEqual(clean_string, expected_string)

    def _is_tag(self, tag, tag_name):
        """Assert that ``tag`` is an element named ``tag_name``."""
        self.assertEqual(tag.name, tag_name)

    def _contains_tag(self, tag, tag_name, count=1):
        """Assert that ``tag`` has exactly ``count`` ``tag_name`` descendants."""
        child_length = len(tag.findAll(tag_name))
        self.assertEqual(child_length, count)
Beispiel #9
0
    def setUp(self):
        """Parse tests/transcript_with_embed.html with CopyDoc and keep its body."""
        with open('tests/transcript_with_embed.html') as fixture:
            markup = fixture.read()

        self.parser = CopyDoc(markup, TOKENS)
        soup = self.parser.soup
        self.body = soup.body
Beispiel #10
0
class CopyDocSpaces(unittest.TestCase):
    """
    Checks on CopyDoc's output for the transcript_with_embed.html fixture.
    """
    def setUp(self):
        # Parse the fixture for each test and keep a handle on its <body>.
        with open('tests/transcript_with_embed.html') as fixture:
            markup = fixture.read()

        self.parser = CopyDoc(markup, TOKENS)
        self.body = self.parser.soup.body

    def test_num_lines(self):
        child_count = len(self.body.contents)
        self.assertEqual(child_count, 4)

    def test_iframe_markup(self):
        expected_iframe = '<iframe width="560" height="315" src="https://www.youtube.com/embed/dZTKOBElkyg" frameborder="0" allowfullscreen></iframe>'
        rendered = self.parser.__unicode__()
        self.assertTrue(expected_iframe in rendered)
Beispiel #11
0
    def setUp(self):
        """Read the testdoc.html fixture, parse it and store the body's child nodes."""
        with open('tests/testdoc.html') as source:
            raw_html = source.read()

        self.parser = CopyDoc(raw_html, TOKENS)
        document_body = self.parser.soup.body
        self.contents = document_body.contents
Beispiel #12
0
class CopyDocTestCase(unittest.TestCase):
    """
    Tests for CopyDoc's parsing of the tests/testdoc.html fixture.

    Each test inspects either the children of the parsed <body>
    (``self.contents``) or an attribute / the unicode rendering of the
    parser itself (``self.parser``).
    """
    def setUp(self):
        with open('tests/testdoc.html') as f:
            html_string = f.read()

        self.parser = CopyDoc(html_string, TOKENS)
        self.contents = self.parser.soup.body.contents

    def test_num_lines(self):
        self.assertEqual(len(self.contents), 19)

    def test_h1(self):
        self._is_tag(self.contents[0], 'h1')

    def test_h1_has_no_children(self):
        child_length = len(self.contents[0].find_all())
        self.assertEqual(child_length, 0)

    def test_h2(self):
        self._is_tag(self.contents[1], 'h2')

    def test_h3(self):
        self._is_tag(self.contents[2], 'h3')

    def test_p(self):
        self._is_tag(self.contents[3], 'p')

    def test_strong(self):
        self._contains_tag(self.contents[4], 'strong')

    def test_em(self):
        self._contains_tag(self.contents[5], 'em')

    def test_u(self):
        self._contains_tag(self.contents[6], 'u')

    def test_ignore_html(self):
        self._contains_tag(self.contents[7], 'strong', 0)

    def test_a(self):
        self._contains_tag(self.contents[8], 'a')

    def test_a_count(self):
        tags = self.parser.soup.body.findAll('a')
        self.assertEqual(len(tags), 2)

    def test_ahref(self):
        href = self.contents[8].a.attrs['href'][0]
        self.assertEqual(href, 'http://npr.org')

    def test_ul(self):
        self._is_tag(self.contents[9], 'ul')

    def test_ul_li(self):
        count_li = len(self.contents[9].find_all('li'))
        self.assertEqual(count_li, 3)

    def test_ol(self):
        self._is_tag(self.contents[10], 'ol')

    def test_ol_li(self):
        count_li = len(self.contents[10].find_all('li'))
        self.assertEqual(count_li, 3)

    def test_img(self):
        self._contains_tag(self.contents[11], 'img')

    def test_strange_has_no_children(self):
        child_length = len(self.contents[12].find_all())
        self.assertEqual(child_length, 0)

    def test_strange_has_extra_space_bug(self):
        clean_string = self.parser.clean_linebreaks(self.contents[12])
        expected_string = '<p>Strange formatting</p>'
        self.assertEqual(clean_string, expected_string)

    def test_tabletag(self):
        self._is_tag(self.contents[13], 'table')

    def test_tabletd(self):
        self._contains_tag(self.contents[13], 'td', 4)

    def test_tabletr(self):
        self._contains_tag(self.contents[13], 'tr', 2)

    def test_anchortag_combination(self):
        self._contains_tag(self.contents[15], 'a')

    def test_headline_extraction(self):
        self.assertEqual(self.parser.headline, 'this is a headline')

    def test_subhed_extraction(self):
        self.assertEqual(self.parser.subhed, 'this is a subhed')

    def test_banner_extraction(self):
        self.assertEqual(self.parser.banner, 'this is a banner')

    def test_image_extraction(self):
        self.assertEqual(
            self.parser.image,
            'http://media.npr.org/assets/img/2015/12/29/gettyimages-477258926_wide-s700-c85.jpg'
        )

    def test_mobile_image_extraction(self):
        self.assertEqual(
            self.parser.mobile_image,
            'https://media.giphy.com/media/3oEdv5FXteGY8iS8CY/giphy.gif')

    def test_audio_url_extraction(self):
        self.assertEqual(
            self.parser.audio_url,
            'http://play.podtrac.com/npr-510310/npr.mc.tritondigital.com/NPR_510310/media/anon.npr-mp3/npr/nprpolitics/2016/02/20160205_nprpolitics_roundup.mp3?orgId=1&d=2261&p=510310&story=465741966&t=podcast&e=465741966&ft=pod&f=510310'
        )

    def test_credit_extraction(self):
        self.assertEqual(self.parser.credit, 'this is a photo credit')

    def test_mobile_credit_extraction(self):
        self.assertEqual(self.parser.mobile_credit,
                         'this is a mobile photo credit')

    def test_iframe_markup(self):
        self.assertTrue(
            '<iframe width="560" height="315" src="https://www.youtube.com/embed/659pppwniXA" frameborder="0" allowfullscreen></iframe>'
            in self.parser.__unicode__())

    def test_nbsp_markup(self):
        self.assertTrue('This is a paragraph with a non-breaking&nbsp;space.'
                        in self.parser.__unicode__())

    def spaces_stripped(self):
        # NOTE(review): unittest only collects methods whose names start with
        # "test", so this check does not run with the suite as named --
        # confirm whether that is intentional before renaming it.
        clean_string = self.parser.clean_linebreaks(self.contents[17])
        expected_string = '<p>This is a paragraph with multiple spaces.</p>'
        # Compare the cleaned markup with the expectation; the previous
        # assertion referenced an undefined name and raised NameError.
        self.assertEqual(clean_string, expected_string)

    def _is_tag(self, tag, tag_name):
        """Assert that ``tag`` is an element named ``tag_name``."""
        self.assertEqual(tag.name, tag_name)

    def _contains_tag(self, tag, tag_name, count=1):
        """Assert that ``tag`` has exactly ``count`` ``tag_name`` descendants."""
        child_length = len(tag.findAll(tag_name))
        self.assertEqual(child_length, count)
Beispiel #13
0
    def setUp(self):
        """Read the transcript_with_embed.html fixture, parse it and store its body."""
        with open('tests/transcript_with_embed.html') as source:
            raw_html = source.read()

        self.parser = CopyDoc(raw_html, TOKENS)
        parsed_soup = self.parser.soup
        self.body = parsed_soup.body
Beispiel #14
0
    def setUp(self):
        """Parse tests/link_italic.html with CopyDoc and keep its body."""
        with open('tests/link_italic.html') as fixture:
            markup = fixture.read()

        self.parser = CopyDoc(markup, TOKENS)
        soup = self.parser.soup
        self.body = soup.body