# Example no. 1
def main(args):
    """Build the header/footer include files from a saved nydailynews.com page.

    Reads the saved page markup from ``dailynews.new`` (gunzipping it when it
    does not start with three spaces), configures a ``Parse`` object with the
    header and footer regexes, substitutes the escaped header and footer
    markup into the ``{{header}}`` / ``{{footer}}`` placeholders of
    ``html/template-dailynews.js``, and opens the files under ``output/``.

    Example usage:
        $ python parse-dailynews.py

    NOTE(review): this function reached review with several lines missing
    (unterminated docstring, unclosed dict literal, ``replace()`` calls with a
    single argument). Every spot that had to be reconstructed is marked with a
    ``NOTE(review)`` comment below and should be checked against the upstream
    copy of this script.

    Args:
        args: Command-line arguments; not used by the visible body.
    """
    fn = 'dailynews.new'
    # NOTE(review): the files are read in binary mode but compared against and
    # combined with str literals below, which only behaves as intended on
    # Python 2 -- confirm the target interpreter.
    with open(fn, 'rb') as fh:
        markup = fh.read()
    if markup[:3] != '   ':
        # We have a gzipped file we have to extract.
        # Per the original author, the uncompressed markup fetched from
        # www.nydailynews.com always starts with '   ', so anything else is
        # taken to be compressed data.
        with gzip.GzipFile(fn, 'r') as gz:
            markup = gz.read()

    # Results of this parsing are stored in p.content.
    # Raw strings keep the backslash-space sequences byte-for-byte without
    # relying on Python passing unknown escapes through.
    # NOTE(review): the 'header' key was missing from the literal; it is
    # restored from the `p.regex = 'header'` assignment below.
    regexes = {
        'header': (
            r'<header\ id="rh">.*<div\ id="header-container"'
            r'\ data-reg-role="header-container"></div>\ </header>'
        ),
        'footer': r'<footer\ id="rf">.*</footer>',
    }
    p = Parse()
    p.regexes = regexes
    # NOTE(review): `markup` is never handed to `p`, and p.regex is assigned
    # twice in a row; the call that runs each extraction (and fills
    # p.content) appears to be missing after each assignment. The Parse API is
    # not visible here, so it is not reconstructed.
    p.regex = 'header'
    p.regex = 'footer'

    # Turn the nav markup into actionable javascript
    with open('html/template-dailynews.js', 'rb') as fh:
        js = fh.read()
    # When life was simple:
    #js = js.replace('{{header}}', " ".join(p.content['header'].replace("\n", "\\n").replace("'", "\\'").replace('/', '\/').splitlines()))
    # Life now:
    # NOTE(review): the '{{header}}' placeholder, the search strings flagged
    # below, and the trailing splitlines() were missing from this chain. They
    # are restored from the footer chain and the one-liner above; the
    # "http://assets.nydaily" search string is the least certain of them.
    header_js = (
        p.content['header']
        .replace('href="/', 'href="http://www.nydailynews.com/')
        .replace("http://assets.nydaily", "//www.nydaily")  # search string reconstructed
        .replace("'", "\\'")
        .replace('/', '\\/')
        .replace("\n", "\\n")
        .replace('join("\\n")', 'join("\\\\n")')  # search string reconstructed
        .replace('/\\n+$', '/\\\\n+$')
        .replace('rh-app.jpg"', 'rh-app.jpg" alt=""')
        .replace('rh-subscribe.jpg"', 'rh-subscribe.jpg" alt=""')  # search string reconstructed
        .replace('notification.png"', 'notification.png" alt=""')  # search string reconstructed
    )
    js = js.replace('{{header}}', " ".join(header_js.splitlines()))

    # NOTE(review): the "'" search string was missing here and is restored to
    # match the header chain.
    # NOTE(review): the '<style>\n' replacement runs after every newline has
    # already been turned into a literal backslash-n, so it cannot match as
    # ordered; the original order is kept.
    footer_js = (
        p.content['footer']
        .replace('href="/', 'href="http://www.nydailynews.com/')
        .replace('article_750', 'article_250')
        .replace("http://assets.nydaily", "https://www.nydaily")
        .replace("'", "\\'")
        .replace('/', '\\/')
        .replace("\n", "\\n")
        .replace('join("\\n")', 'join("\\\\n")')
        .replace('/\\n+$', '/\\\\n+$')
        .replace('7.2945742 -->   <style>\n', '7.2945742 -->   <style>')
        .replace('\r', '')
    )
    js = js.replace('{{footer}}', footer_js)

    with open('html/head.html', 'rb') as fh:
        head_markup = fh.read()

    # Write the file
    if p.content['header'] != '':
        # NOTE(review): `f` is rebound before anything is written to
        # output/header.html; a write call appears to be missing here.
        f = FileWrapper('output/header.html')
        f = FileWrapper('output/header-iframeable.html')
        f.write('%s%s' % (head_markup, p.content['header']))
    if p.content['footer'] != '':
        # NOTE(review): same as above for output/footer.html.
        f = FileWrapper('output/footer.html')
        f = FileWrapper('output/footer-iframeable.html')
        f.write('%s%s' % (head_markup, p.content['footer']))
    if p.content['footer'] != '' and p.content['header'] != '':
        # NOTE(review): `js` is not written within the visible part of this
        # function; the write presumably follows this line -- confirm.
        f = FileWrapper('output/vendor-include.js')