import re
import time
import urllib
import urlparse

import mx.DateTime
import BeautifulSoup

import miscfuncs


def GlueByContents(fout, url_contents, regmemdate):
    ur = urllib.urlopen(url_contents)
    sr = ur.read()
    ur.close()

    soup = BeautifulSoup.BeautifulSoup(sr)
    mps = soup.find('a', attrs={'name': 'A'}).parent.findNextSiblings('p')
    for p in mps:
        url = urlparse.urljoin(url_contents, p.a['href'])
        #print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        if ur.code == 404:
            print "failed to fetch %s - skipping" % url
            continue

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # expand self-closing <p/> tags into open/close pairs so the parse keeps them
        sr = re.sub('<p([^>]*)/>', r'<p\1></p>', sr)
        soup_mp = BeautifulSoup.BeautifulSoup(sr)
        page = soup_mp.find('h1').findNextSiblings(lambda t: t.name != 'div')
        page = '\n'.join([str(p) for p in page]) + '\n'
        miscfuncs.WriteCleanText(fout, page)
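# --- illustration only, not part of the original scraper ---
# Every glue function writes a marker line before each fetched page so later
# stages can tell which source page a chunk of text came from.  The helper
# below is hypothetical, added here purely to show that marker format.
def _example_page_marker(url):
    lt = time.gmtime()
    return '<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
        (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt))
# e.g. _example_page_marker('http://example.org/p1.htm') returns something like
# '<page url="http://example.org/p1.htm" scrapedate="2013-03-27" scrapetime="12:00:00"/>\n'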
def GlueByNext(fout, url, regmemdate):
    # loop which scrapes through all the pages following the nextlinks
    starttablewritten = False
    sections = 0
    while 1:
        print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        sections += 1

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # split by sections
        hrsections = re.split(
            '<a name="top"></a>|'
            '<!-- end of variable data -->'
            '(?i)', sr)

        # write the body of the text
        #for i in range(0, len(hrsections)):
        #    print "------"
        #    print hrsections[i]
        text = hrsections[1]
        m = re.search('<TABLE .*?>([\s\S]*)</TABLE>', text)
        if m:
            text = m.group(1)
        m = re.search('<TABLE .*?>([\s\S]*)', text)
        if m:
            text = m.group(1)
        if not starttablewritten and re.search('COLSPAN=4', text):
            text = "<TABLE>\n" + text
            starttablewritten = True
        miscfuncs.WriteCleanText(fout, text)

        # find the lead on with the footer
        footer = hrsections[2]

        nextsectionlink = re.findall(
            '<a href="([^>]*?)"><img border=0\s+align=top src="/pa/img/conu(?:grn|drd).gif" alt="continue"></a>',
            footer)
        if not nextsectionlink:
            break
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")
        url = urlparse.urljoin(url, nextsectionlink[0])

    # if this fails, we evidently never followed a "continue" link
    assert sections > 1
    fout.write('</TABLE>')
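# --- illustration only ---
# A sketch of how the split above sections the old-format pages: one
# '<a name="top"></a>' precedes the body and one '<!-- end of variable data -->'
# follows it, so hrsections comes out as [banner, body, footer]:
#
#     >>> re.split('<a name="top"></a>|<!-- end of variable data -->(?i)',
#     ...          'BANNER<a name="top"></a>BODY<!-- end of variable data -->FOOTER')
#     ['BANNER', 'BODY', 'FOOTER']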
def GlueByNext(fout, urla, urlx, sdate):
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/61130-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/70125-0001.htm':
        urla = urla[2:]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200506/ldhansrd/vo050517/text/50517-02.htm':
        urla.insert(
            0, 'http://www.publications.parliament.uk/pa/ld200506/ldhansrd/vo050517/text/50517-01.htm')
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200405/ldhansrd/vo041123/text/41123-02.htm':
        urla.insert(
            0, 'http://www.publications.parliament.uk/pa/ld200405/ldhansrd/vo041123/text/41123-01.htm')
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200708/ldhansrd/text/80722-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld200708/ldhansrd/text/81104-0001.htm':
        urla = [urla[0]]
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110119-0001.htm':
        urla = [urla[0]]
    # Missing header/footer, need to be able to find 2nd HTML page
    if urla[0] == 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327-0001.htm':
        urla.insert(
            1, 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327-0002.htm')

    # loop which scrapes through all the pages following the nextlinks,
    # knocking off the known links as we go in case a "next page" is missing.
    while urla:
        url = urla[0]
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        # To cope with post 2006-07-03, turn <body> into <hr>
        sr = re.sub('<body><notus', '<body><hr><notus', sr)
        #sr = re.sub('<body><br>', '<body><hr><br>', sr)
        sr = re.sub('<body><h3 align="center"', '<body><hr><h3 align="center"', sr)
        sr = re.sub('<body><p>', '<body><hr><p>', sr)

        # post 2006-09
        sr = re.sub("</?mekonParaReplace[^>]*>", "", sr)
        sr = re.sub("</?mekonHrefReplace[^>]*>", "", sr)
        sr = re.sub("<meta[^>]*>", "", sr)
        sr = re.sub('<a name="([^"]*)" />', r'<a name="\1"></a>', sr)
        # Should be WriteCleanText like for Commons?
        sr = re.sub('(<a href="[^"]*&)(">.*?)(</a>)(;.*?)([ .,<])', r'\1\4\2\4\3\5', sr)
        sr = re.sub('<div id="maincontent1">\s+<notus', '<hr> <notus', sr)
        sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<notus', '<hr> <notus', sr)
        # New 2008-10...
        sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<h1', '<hr> <h1', sr)
        # New 2011-01...
        sr = re.sub('<div id="maincontent">(?:\s*<table.*?</table>)?(?s)', '', sr)
        if url in (
                'http://www.publications.parliament.uk/pa/ld200607/ldhansrd/text/71001w0001.htm',
                'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110118-0001.htm',
        ):
            sr = re.sub('Daily Hansard</span></div>', 'Daily Hansard</span></div> <hr>', sr)

        # To deal with missing header/footer on this day. Might need removing if they come back?
        if url == 'http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/121105-wms0001.htm' \
                or re.match('http://www.publications.parliament.uk/pa/ld201213/ldhansrd/text/130327', url):
            sr = re.sub('<body>', '<body> <hr>', sr)
        # For 2013-02-26, 2013-05-08, so far
        sr = re.sub('<div id="content-small"><!--end', '<div id="content-small"> <hr><!--end', sr)

        # 2012 summer recess hack
        #if re.match('http://www.publications.parliament.uk/pa/ld/ldtoday/writtens/..0[78]2012\.htm$', url):
        #    sr = sr.replace('<div class="hansardContent">', '<hr><a name="column_WA0">').replace('<hr/>', '<hr>')

        # post 2008-03, stupid duplication of <b>s
        sr = re.sub('<b>((?:<a name="[^"]*"></a>)*)<b>', '\\1<b>', sr)
        sr = re.sub('</b><!--[^>]*--></b>', '</b>', sr)

        # split by sections
        hrsections = re.split('<hr[^>]*>(?i)', sr)

        # this is the case for debates on 2003-03-13 page 30
        # http://www.publications.parliament.uk/pa/cm200203/cmhansrd/vo030313/debtext/30313-32.htm
        if len(hrsections) == 1:
            # special case for the Grand committee proceedings on 2011-03-23
            if url == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-gc0001.htm':
                miscfuncs.WriteCleanText(fout, sr, False)
            else:
                #print len(hrsections), 'page missing', url
                #fout.write('<UL><UL><UL></UL></UL></UL>\n')
                print "Bridging the empty page at %s" % url
                urla = urla[1:]
                continue

        # Lords Written Statements on 2006-07-05, for example, sadly
        if len(hrsections) == 2:
            miscfuncs.WriteCleanText(fout, hrsections[1], False)

        # write the body of the text
        for i in range(1, len(hrsections) - 1):
            miscfuncs.WriteCleanText(fout, hrsections[i], False)

        # find the lead on with the footer
        # (the files are sectioned by the <hr> tag into header, body and footer)
        footer = hrsections[-1]

        nextsectionlink = re.findall(
            '<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next section</a>(?i)', footer)
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")

        if not nextsectionlink:
            urla = urla[1:]
            if urla:
                print "Bridging the missing next section link at %s" % url
        else:
            url = urlparse.urljoin(url, nextsectionlink[0])
            # Specific case on 2011-03-23
            if url == 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-wms0001.htm':
                url = 'http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/110323-gc0001.htm'
            # this link is known
            if (len(urla) > 1) and (urla[1] == url):
                urla = urla[1:]
            # unknown link, either there's a gap in the urla's or a mistake.
            else:
                for uo in urla:
                    if uo == url:
                        print '\n'.join(urla)
                        print "\n\n"
                        print url
                        print "\n\n"
                        raise Exception("Next Section misses out the urla list")
                urla[0] = url
    pass #endwhile urla
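# --- illustration only ---
# The Lords pages are sectioned with re.split and a trailing (?i) inline flag,
# which (in Python 2) makes the whole pattern case-insensitive, so both <HR>
# and <hr width="90%"> act as section breaks:
#
#     >>> re.split('<hr[^>]*>(?i)', '<p>head</p><HR><p>body</p><hr width="90%"><p>foot</p>')
#     ['<p>head</p>', '<p>body</p>', '<p>foot</p>']
#
# giving the [header, body..., footer] list the code above indexes into.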
def GlueByNext(fout, url, regmemdate):
    # loop which scrapes through all the pages following the nextlinks
    starttablewritten = False
    matcheddate = False
    if re.search("ldreg", url):
        matcheddate = True
    sections = 0
    while 1:
        #print " reading " + url
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()
        sections += 1

        # check date
        if not matcheddate:
            dateinpage = re.search("current as at\s*<[bB]>(.*)</[bB]>", sr)
            if not dateinpage:
                raise Exception('Date marker not found')
            dateinpage = dateinpage.group(1).replace("&nbsp;", " ")
            dateinpage = mx.DateTime.DateTimeFrom(dateinpage).date
            if dateinpage != regmemdate:
                raise Exception('Date in page is %s, expected %s - update the URL list in regmempullgluepages.py' % (
                    dateinpage, regmemdate))
            matcheddate = True

        # write the marker telling us which page this comes from
        lt = time.gmtime()
        fout.write('<page url="%s" scrapedate="%s" scrapetime="%s"/>\n' % \
            (url, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

        # split by sections
        hrsections = re.split(
            '<TABLE border=0 width="90%">|'
            '</TABLE>\s*?<!-- end of variable data -->|'
            '<!-- end of variable data -->\s*</TABLE>'
            '(?i)', sr)

        # write the body of the text
        #for i in range(0, len(hrsections)):
        #    print "------"
        #    print hrsections[i]
        text = hrsections[2]
        m = re.search('<TABLE .*?>([\s\S]*)</TABLE>', text)
        if m:
            text = m.group(1)
        m = re.search('<TABLE .*?>([\s\S]*)', text)
        if m:
            text = m.group(1)
        if not starttablewritten and re.search('COLSPAN=4', text):
            text = "<TABLE>\n" + text
            starttablewritten = True
        miscfuncs.WriteCleanText(fout, text)

        # find the lead on with the footer
        footer = hrsections[3]

        nextsectionlink = re.findall(
            '<A href="([^>]*?)"><IMG border=0\s+align=top src="/pa/img/next(?:grn|drd).gif" ALT="next page"></A>',
            footer)
        if not nextsectionlink:
            break
        if len(nextsectionlink) > 1:
            raise Exception("More than one Next Section!!!")
        url = urlparse.urljoin(url, nextsectionlink[0])

    # if this fails, the scrape stopped before following enough "next page" links
    assert sections > 10
    fout.write('</TABLE>')
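# --- illustration only ---
# The date check above relies on mx.DateTime parsing the free-form date text
# from the page, with .date rendering it as an ISO string, e.g. (assuming a
# standard mx.DateTime install):
#
#     >>> mx.DateTime.DateTimeFrom('4 November 2002').date
#     '2002-11-04'
#
# which is then compared directly against regmemdate.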
def GlueByNext(fout, urlx, billtitle):
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" billtitle="%s"/>\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt), billtitle))

    url = urlx
    year = int(re.search('cm(\d{4})', urlx).group(1))
    if year >= 2010:
        pageheader = '<div id="content"'
        pagefooter = '<a name="end"/>'
    else:
        pageheader = '<img\s*src="/pa/img/portsgrn.gif"\s*alt="House\s*of\s*Commons\s*portcullis"><BR>'
        # there are various green button gifs, including two which say "continue", but with different filenames
        pagefooter = '<a href\s*=\s*"[^"]*">\s*<img border=0(?: align=top)? src="/pa/img/(?:ctntgrn|conugrn|prevgrn|contgrn).gif"'

    if re.search("/pa/cm200203/cmstand/d/st030401/am/30401s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200102/cmstand/d/st020115/am/20115s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200304/cmstand/c/st040428/pm/40428s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200203/cmstand/c/st030402/30402s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200102/cmstand/g/st020213/am/20213s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm199900/cmstand/f/st000525/00525s10.htm#pm$", urlx):
        pageheader = "<a name=pm>"
        url = re.sub("#pm", "", url)
    if re.search("/pa/cm200910/cmpublic/bribery/100323/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'
    if re.search("/pa/cm200910/cmpublic/cooperativeandcommunity/100303/am", urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/marriagewales/100224/pm", urlx):
        pagefooter = '<a name="end">'
    if re.search('/pa/cm200910/cmpublic/thirdparties/100316/am', urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/gromarket/100330/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'

    # loop which scrapes through all the pages following the nextlinks,
    # knocking off the known links as we go in case a "next page" is missing.
    while True:
        if re.search("/pa/cm199798/cmstand/b/st971106/am/71106s04.htm$", url):
            url = re.sub("s04.htm", "s05.htm", url)  # skip over missing page

        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        repagebody = '(?si).*?%s(.*?)%s' % (pageheader, pagefooter)
        mbody = re.match(repagebody, sr)
        if not mbody:
            if re.search("/pa/cm199899/cmstand/e/st990429/am/90429s03.htm$", url):
                # continuation does not exist
                break
            if re.search("/pa/cm199899/cmstand/special/st990420/pm/pt3/90420s12.htm$", url):
                # continuation does not exist
                break
            if re.search("/pa/cm200203/cmstand/d/st031016/pm/31016s06.htm$", url):
                # continuation does not exist
                break
            print "\n", pageheader, "\n\n", pagefooter, "\n\n"
            print "header", re.search('(?si)' + pageheader, sr)
            print "footer", re.search('(?si)' + pagefooter, sr)
            print url
            print sr[:2000]
            assert False
        miscfuncs.WriteCleanText(fout, mbody.group(1), False)

        # the next-link is searched for in the text after the body match
        mnextsectionlink = re.search(
            '(?si)<\s*a\s+href\s*=\s*"?([^"]*?)"?\s*>\s*<img border=0 align=top src="/pa/img/conugrn.gif"',
            sr[mbody.end(1):])
        #print " nextsectionlink", mnextsectionlink
        if not mnextsectionlink:
            break
        url = urlparse.urljoin(url, mnextsectionlink.group(1))
        if miscfuncs.IsNotQuiet():
            print " ", re.sub(".*?cmstand/", "", url)

        # second and subsequent pages
        pageheader = '<p align=right>\[<a href="[^"]*">back to previous text</a>\]'
    pass #endwhile url
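# --- illustration only ---
# Each "next page" hop resolves the relative href against the page it was
# found on with urlparse.urljoin, e.g. for the standing committee pages:
#
#     >>> urlparse.urljoin('http://www.publications.parliament.uk'
#     ...                  '/pa/cm199798/cmstand/b/st971106/am/71106s04.htm',
#     ...                  '71106s05.htm')
#     'http://www.publications.parliament.uk/pa/cm199798/cmstand/b/st971106/am/71106s05.htm'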