Example #1
def rp_table():
    url = "http://www.atoc.org/clientfiles/File/routeing_points.pdf"

    print "Processing routeing point list"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)

    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)

    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]

    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS rp_maps')
    scraperwiki.sqlite.execute('CREATE TABLE rp_maps (routeing_point, mapname)')

    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if int(cell.attrib['top']) > 100:
            if cell.attrib['left'] == '38':
                rp = cell.xpath('string()').title()
            else:
                for mapname in cell.xpath('string()').split():
                    scraperwiki.sqlite.execute('INSERT INTO rp_maps VALUES (?, ?)',
                        (rp, mapname))
        cell.clear()

    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX maps_byrp ON rp_maps(routeing_point)')
    scraperwiki.sqlite.execute('CREATE INDEX rps_bymap ON rp_maps(mapname)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Routeing point list processed"
def do_it(pdfurl):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div.
    '''

    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)

    # just turn it into plain text
    raw = ''
    for index, page in enumerate(root):
        for text in page:
            raw += ' '.join(text.xpath("descendant-or-self::text()"))
            raw += "\n"

    # pull out the grades
    data = {}
    grades = extract_grades(raw)
    for i in range(1,8):
        data['key_question_grade_%d'%(i,)] = grades[i]
    data['date_of_inspection'] = extract_date(raw)

    return data
def getpages(href):
    pdfdata = scraperwiki.scrape(href)
    xml = scraperwiki.pdftoxml(pdfdata)
    dom = lxml.etree.fromstring(xml)
    pages = list(dom)
    print "The pages are numbered:", [page.attrib.get("number") for page in pages]
    return pages
def getpdfs():
    html = parse('http://www.safaricom.co.ke/index.php?id=275').getroot()
    html.make_links_absolute()
    pdf_urls = html.xpath('//table[@class="contenttable" and @width="540"]/descendant::a/@href')
    
    for url in pdf_urls:
        save(['date_scraped', 'url'], {"date_scraped": DATE, "url": url, "pdfxml": pdftoxml(urlopen(url).read())}, 'pdfs')
Example #5
def pdfGrabber(typ):
    #if src =='f1mediacentre': url = "http://www.fia.com/en-GB/mediacentre/f1_media/Documents/"+race+"-"+typ+".pdf"
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-times.pdf
    #http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-f1-2012-fp1-classification.pdf
    ##trying http://184.106.145.74/fia-f1/f1-2012/f1-2012-08/eur-fp1-classification.pdf
    rnum='08'
    typ2=typ.replace('session','fp')
    if src =='f1mediacentre': url = "http://184.106.145.74/fia-f1/f1-2012/f1-2012-"+rnum+"/"+race+"-f1-2012-"+typ2+".pdf"
    else: url="http://dl.dropbox.com/u/1156404/"+race+"-"+typ+".pdf"
    #url='http://dl.dropbox.com/u/1156404/mal-race-analysis.pdf'
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)

    xmldata = scraperwiki.pdftoxml(pdfdata)
    '''
    print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 2000 characters are: ", xmldata[:2000]
    '''

    root = lxml.etree.fromstring(xmldata)
    
    pages = list(root)
    #print 'pre',pages
    print "The pages are numbered:", [ page.attrib.get("number")  for page in pages ]
    return pages
Example #6
def parseReport(pdfurl,urn):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div.
    '''
    try:
        pdfdata = urllib2.urlopen(pdfurl).read()
        if pdfdata == '':
            return "Failed to load/PDF does not exist"
        pdfxml = scraperwiki.pdftoxml(pdfdata)
        root = lxml.etree.fromstring(pdfxml)
        reportdata = []
        #print "URN %s URL %" % (pdfurl,urn)
    
        # Print each page of the PDF.
        for index, page in enumerate(root):
            data = PageSave(page, index,urn)
            reportdata.append(data)
            for ldata in data:
                #print data
                lldata = ldata.copy()
                lldata["urm"] = urn
                scraperwiki.sqlite.save(unique_keys=ldata.keys(), data=lldata, table_name="other")
        #print reportdata
        report = {'urn':urn, 'data':reportdata}
        print report
        scraperwiki.sqlite.save(unique_keys=["urn"], data=report)
        return "Success"
    except Exception, e:
        return "Error %s" % e
def Main(pdfurl):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div. 
    '''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)

    global styles
    fontspecs = { }

    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size':int(fontspec.attrib.get('size')), 'family':fontspec.attrib.get('family'), 'color':fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])

    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print '    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print '    <title>PDF to XML text positioning</title>'
    print '    <style type="text/css" media="screen">%s</style>' % "\n".join([ "%s { %s }" % (k, v)  for k, v in styles.items() ])
    print '    <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print '    <script>%s</script>' % jscript
    print '</head>'

    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
    def preprocess(self, pdfurl, pdfcontent):
        print "Preprocessing PDF " + pdfurl
        if not pdfcontent:
            raise ValueError("No pdf content passed for " + pdfurl)
        if self.hiddentext:
            options = '-hidden'
        else:
            options = ''
        xml=scraperwiki.pdftoxml(pdfcontent, options)
        if self.debug:
            print xml
        pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
        xml=None
#    print pages[:1][:1000]
        pagecount = 0
        datastore = []
        for page in pages:
            pagecount = pagecount + 1
            self.is_valid_page(pdfurl, pagecount, page)
            data = {
                'scrapedurl' : pdfurl,
                'pagenum' : pagecount,
                'pagecontent' : page,
            }
            datastore.append(data)
        if 0 < len(datastore):
            scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
        else:
            raise ValueError("Unable to find any pages in " + pdfurl)
        pages = None
def scrapereport(reportlink):
    boldline = 0
    html = scraperwiki.scrape(baseurl+reportlink)
    root = lxml.html.fromstring(html)
    links = root.cssselect("div#unusefulbottom a")
    #<div id="unusefulbottom">
    for link in links:
        print "LINK GRABBED WITH CSSSELECT", link
        print "link.attrib.get", link.attrib.get('href')
        downloadlink = link.attrib.get('href')
#    print " downloadlink[0].text_content()", downloadlink[0].text_content()
        pdfdata = urllib2.urlopen(baseurl+downloadlink).read()
        print "pdfdata", pdfdata
        xmldata = scraperwiki.pdftoxml(pdfdata)
        print "xmldata", xmldata
        pdfxml = lxml.etree.fromstring(xmldata)
        print "pdfxml", pdfxml
        boldtags = pdfxml.xpath('.//text')
        linenumber = 0
        for heading in boldtags:
            linenumber = linenumber+1
            #print "Heading:", heading.text
            if heading.text is not None:
#                mention = re.match(r'.*NMS.*',heading.text)
                mention = re.match(r'.*overall.*',heading.text)
                if mention:
                    print "FULL LINE", lxml.etree.tostring(heading, encoding="unicode", method="text")
#                    print "OVERALL", heading.text
#                    print "CHECK", pdfxml.xpath('.//text')[linenumber-1].text
#                    print "LINEAFTER", pdfxml.xpath('.//text')[linenumber].text
                    record['overall'] = lxml.etree.tostring(heading, encoding="unicode", method="text")
                    record['uniqueref'] = reportlink+"_"+str(boldline)
                    record['downloadlink'] = baseurl+downloadlink
                    scraperwiki.sqlite.save(['uniqueref'],record)
Example #10
def read_file_return_etree(uid):
    with open('cached_pdfs/{}.pdf'.format(uid), 'r') as f:
        pdfdata = f.read()                                    # str
    xmldata = scraperwiki.pdftoxml(pdfdata)                   # unicode
    xmldata = bytes(bytearray(xmldata, encoding='utf-8'))     # str
    element_tree = ET.fromstring(xmldata)
    return element_tree
def scrape_cieisp(year, text):
    if (year == 2010):
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/INCIDENCIA_DELICTIVA_2010_030211.pdf"
    else:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/CIEISP" + `year` + ".pdf"
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))

    dolosos_position = []
    i = 0
    for t in s.findAll('text'):
        if t.text == "DOLOSOS":
            if text == "POR ARMA DE FUEGO":
                dolosos_position.append(i+14)
            else:
                dolosos_position.append(i)
        i += 1

    all_text = s.findAll('text')
    #print all_text
    
    if (year <= 2008) :
        if (year >=2006):
            states_names = states3
        else:
            states_names = states2
    else:
        states_names = states

    for i in range(0,33):
        for j in range(1,14):
            record = {'State' : states_names[i], 'Year' : year, 'Month' : months[j-1], 'Homicides' : all_text[dolosos_position[i]+j].text, 'Crimetype' : text}
            scraperwiki.datastore.save(["State", "Year", "Month"], record)
    return
def getTablePages():
  url = "http://www.bnr.rw/docs/publicnotices/List%20of%20MFIs%20Update_Sept_%202011.pdf"
  pdfdata = urlopen(url).read()
  xmldata = pdftoxml(pdfdata)
  root = fromstring(xmldata)
  pages = list(root)
  return pages
Example #13
def pdfParser(pdfdata,path):
	txt=[]
	
	# pdfdata is passed in already downloaded, so go straight to the XML conversion
	xmldata = scraperwiki.pdftoxml(pdfdata)
	root = lxml.etree.fromstring(xmldata)

	# 4. Have a peek at the XML (click the "more" link in the Console to preview it).
	#print lxml.etree.tostring(root, pretty_print=True)

	# 5. How many pages in the PDF document?
	pages = list(root)
	#print "There are",len(pages),"pages"

	# 6. Iterate through the elements in each page, and preview them
	for page in pages:
		for el in page:
			if el.tag == "text":
				#print el.text, el.attrib
				if el.text!=None: txt.append(el.text)


	try:
		ftxt=open(path+'/text.txt','w')
		ftxt.write("\n".join(txt).encode('utf-8'))
		ftxt.close()
	except: pass
def urltohtml(url="http://www.madingley.org/uploaded/Hansard_08.07.2010.pdf"):
    import scraperwiki, urllib2, lxml.etree
    lazycache=scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)

    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    
    # this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
    def gettext_with_bi_tags(el):
        res = [ ]
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
            if lel.tail:
                res.append(lel.tail)
        return "".join(res)
    
    # print the first hundred text elements from the first page
    text=[]
    for page in pages:
        for el in list(page)[:100]:
            if el.tag == "text":
                text.append(gettext_with_bi_tags(el))
    return '\n'.join(text)
def fetch_rows(url, x_threshold):
    points_rows = []
    districts_rows = []
    print url
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    pages = root.xpath("//page")
    for page in pages:
        page_number = int(page.xpath("./@number")[0])
        texts = page.xpath("./text")
        tag = 0
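        # cells to the right of x_threshold are collected as points, the rest as
        # districts; a leading "N." number in a cell sets the current tag value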
        for text in texts:
            x = int(text.xpath("./@left")[0])
            y = int(text.xpath("./@top")[0])
            if text.text is not None:
                
                s = re.sub(r'\d+\.', '', text.text).strip()
                m = re.match(r'\d+\.', text.text)
                if m is not None:
                    tag = int(m.group(0).strip()[0:-1])
                if len(s) == 0:
                    continue
                d = {'text': s, 'y': y, 'x': x, 'page': page_number, 'tag': tag}
                if x >= x_threshold:
                    points_rows.append(d)                   
                else:
                    districts_rows.append(d)
    return  (points_rows, districts_rows)
def parse_pdf(url):
    pdf_data = urllib2.urlopen(url).read()
    assert len(pdf_data) > 0
    
    xml_data = sw.pdftoxml(pdf_data)
    tree = etree.parse(StringIO(xml_data))
    root = tree.getroot()
    print root.xpath('//text[@left<200]')
def GetPDFtrans():
    pdfurl = "http://www.birmingham.gov.uk/cs/Satellite?%26ssbinary=true&blobcol=urldata&blobheader=application%2Fpdf&blobheadername1=Content-Disposition&blobkey=id&blobtable=MungoBlobs&blobwhere=1223439077563&blobheadervalue1=attachment%3B+filename%3D444523Payments+over+%C2%A3500+August.pdf"
    c = urllib.urlopen(pdfurl).read()
    x = scraperwiki.pdftoxml(c)
    print x[:4000]
    urlup = "http://seagrass.goatchurch.org.uk/~julian/cgi-bin/uu.cgi"
    d = urllib.urlencode({"name":"brumpdf500xml", "contents":x})
    print urllib.urlopen(urlup, d).read()
def scrapeschool(url):
    print url
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
        #create an empty variable 'record', which is a dictionary
    record = {}
        #create a uniqueid that we'll add to with each record later
    uniqueid = 0
    record["school"] = root.cssselect("h1")[0].text_content()
    record["parentviewurl"] = root.xpath(".//div[@id='content']//a")[0].attrib.get('href')
#Expressed more simply, this could take up three lines like so:
#    parentviewurls = root.xpath(".//div[@id='content']//a")
#    parentviewurl = parentviewurls[0].attrib.get('href')
#    record["parentviewurl"] = parentviewurl
    record["URN"] = root.xpath(".//div[@id='content']//p//strong")[0].text_content()
    record["Address"] = lxml.etree.tostring(root.xpath(".//div[@id='content']//p")[1])
    report1url = root.xpath(".//table[@summary='Previous reports']//td//a")[0].attrib.get('href')
    record["report1url"] = report1url
#    record["inspectiondate"] = root.xpath(".//table[@summary='Previous reports']//td")[1].text_content
    uniqueid += 1
    record["uniqueid"] = uniqueid
    print record
#use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(baseurl+report1url).read()
#use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
#use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    leadership = re.search(r'b>The quality of .* <b',pdfread)
    if leadership:
#        print linenumber
        print leadership.group()
#find all <b> tagged lines - headings?
    lines = pdfroot.findall('.//text')
    linenumber = 0
    for line in lines:
        linenumber = linenumber+1
        if line.text:
            FSM = re.match(r'.* free school meals .*',line.text)
            if FSM:
                print linenumber
                print FSM.group()
#                if pdfroot.xpath('.//text')[linenumber-2].text:
                print pdfroot.xpath('.//text')[linenumber-2].text
                print pdfroot.xpath('.//text')[linenumber-1].text
                print pdfroot.xpath('.//text')[linenumber].text
#                if pdfroot.findall('.//text')[linenumber].text:
                record["FSM3"] = pdfroot.findall('.//text')[linenumber].text
                print record

#UP TO HERE. NEED TO:
#IDENTIFY THE LINE WE WANT - PERHAPS .XPATH AND (CONTAINS)
#GRAB X CHARACTERS AFTER THAT - OR:
#IDENTIFY THE INDEX POSITION OF THAT <TEXT><B> HEADING AND THE NEXT ONE AND GRAB ALL LINES BETWEEN
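#ONE POSSIBLE (UNTESTED) SKETCH FOR THE ABOVE, COMMENTED OUT - "section" IS A MADE-UP FIELD NAME:
#    headings = pdfroot.xpath('.//text[b]')
#    for i, heading in enumerate(headings):
#        if 'quality of' in heading.xpath('string()'):
#            start = lines.index(heading)
#            end = lines.index(headings[i + 1]) if i + 1 < len(headings) else len(lines)
#            record["section"] = " ".join(l.text for l in lines[start + 1:end] if l.text)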

    scraperwiki.sqlite.save(["uniqueid"],record)
    def get_id_period (self, date):

        from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
        from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
        to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)

        url_date = to_dt.strftime(self.search_url1)
        if self.DEBUG: print url_date
        try:
            response = self.br.open(url_date)
        except:
            url_date = to_dt.strftime(self.search_url2)
            if self.DEBUG: print url_date
            try:
                response = self.br.open(url_date)
            except:
                url_date = to_dt.strftime(self.search_url3)
                if self.DEBUG: print url_date
                try:
                    response = self.br.open(url_date)
                except:
                    response = None

        final_result = []
        if response:
            html = response.read()
            if self.DEBUG: print html
            url = response.geturl()
            result = scrapemark.scrape(self.scrape_ids1, html, url)
            if not result or not result.get('records'):
                result = scrapemark.scrape(self.scrape_ids1a, html, url)
            if not result or not result.get('records'):
                result = scrapemark.scrape(self.scrape_ids2, html, url)
            if not result or not result.get('records'):
                pdfxml = scraperwiki.pdftoxml(html)
                if self.DEBUG: print pdfxml
                result = scrapemark.scrape(self.scrape_ids3, pdfxml, url)     
            if result and result.get('records'):
                for rec in result['records']:
                    rec['url'] = url_date
                    rec['date_received'] = to_iso_dt
                    rec['start_date'] = to_iso_dt
                    if rec.get('agent1'):
                        if rec.get('agent2'):
                            rec['agent_address'] = rec['agent1'] + ' ' + rec['agent2']
                        else:
                            rec['agent_address'] = rec['agent1']
                        del rec['agent1']
                        if 'agent2' in rec: del rec['agent2']
                self.clean_ids(result['records'])
                for rec in result['records']: # note do this after record cleaning
                    rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
                final_result.extend(result['records'])
        #else:
        #    return [], None, None

        return final_result, from_dt, to_dt 
    def __init__(self, filename):
        # load the pdf
        with open(filename) as f:
            pdf_string = f.read()
        # convert to xml
        xml_string = scraperwiki.pdftoxml(pdf_string)
        # parse xml
        self._xml = lxml.etree.fromstring(xml_string)
        self._pages = [self._page_to_blocks(page_num) for page_num in range(1, self.page_count() + 1)]
def ConvertPDFtoSqlite(docname, pdfurl):
    print "converting", docname, pdfurl
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print "Bad xml file", str(e)
        print pdfxml[:19000]
        return
def Main(pdfurl):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div. 
    '''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)

    global styles
    fontspecs = { }

    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {'size':int(fontspec.attrib.get('size')), 'family':fontspec.attrib.get('family'), 'color':fontspec.attrib.get('color')}
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size'])

    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print '    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print '    <title>PDF to XML text positioning</title>'
    print '    <style type="text/css" media="screen">%s</style>' % "\n".join([ "%s { %s }" % (k, v)  for k, v in styles.items() ])
    print '    <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print '    <script>%s</script>' % jscript
    print '</head>'

    print '<div class="info" id="info1">&lt;text block&gt;</div>'
    print '<div class="info" id="info2">&lt;position&gt;</div>'

    print '<div class="heading">'
    print '<h2>Graphical preview of scraperwiki.pdftoxml(pdfdata)</h2>'

    print '<p>Click on a text line to see its coordinates and any other text that shares the same column or row.'
    print '   Useful for discovering what coordinates to use when extracting rows from tables in a document.</p>'
    print '<p>To do: track the coordinates of the mouse and cross reference with <a href="/cropper">cropper</a> technology.</p>'

    print '<p class="href"><a href="%s">%s</a></p>'% (pdfurl, pdfurl)
    print '<form id="newpdfdoclink">'
    print '    Another PDF link:'
    print '    <input type="text" name="url" value="" title="paste in url of new document">'
    print '    <input type="submit" value="Go">'
    print '</form>'
    ttx = re.sub('<', '&lt;', pdfxml)
    ttx = re.sub('\n', '\r\n', ttx) 
    print '<textarea class="pdfprev">%s</textarea>' % ttx[:5000]
    print '</div>'

    print '<p>There are %d pages</p>' % len(root)

    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
def parsepdf(pdfurl):
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))
    kobcine = {}
    for t in s.findAll('text'):
        if t.text != " ":
            ko_ime = find_ko(t.text)
            if ko_ime:
                ko = kobcine.setdefault(ko_ime, 0)
                kobcine[ko_ime] = ko + 1
    return kobcine
def paginating(url):
    pages = range(13, 276, 2)
    pdf_data = urllib2.urlopen(url).read()
    xml_data = sw.pdftoxml(pdf_data)
    html_data = html.fromstring(xml_data)
    for page in pages:
        page_data = html_data.cssselect('page')[page]
        
        print html.tostring(page_data)
        
        parse_pdf_header(page_data)
Example #25
def pink_pages():
    #
    # Pink pages
    #

    print "Loading pink pages (station to RP mapping)"
    url = "http://www.atoc.org/clientfiles/File/routeing_point_identifier.pdf"

    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)
    print "Converting to XML"

    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]

    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)

    # Each station is on a single line consisting of the station name and then
    # the various routeing points.

    stncells = root.xpath('//text[@left=37]')

    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS routeing_points')
    scraperwiki.sqlite.execute('CREATE TABLE routeing_points (station, routeing_point)')
    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS groups')
    scraperwiki.sqlite.execute('CREATE TABLE groups (station, stngroup)')

    find_other_cells = lxml.etree.XPath('following-sibling::text[@top=$this/@top]')
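    # (the $this XPath variable is bound to the station cell at call time; the expression
    #  returns the later <text> cells on the same page that share its @top, i.e. its row)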

    print "Extracting station list"
    for stncell in stncells:
        # Find other cells on the same row of the same page.
        othercells = find_other_cells(stncell, this = stncell)
        for othercell in othercells:
            stn, other = stncell.xpath('string()'), othercell.xpath('string()')
            stn = stn.title()
            if other == "Routeing Point":
                other = stn
            if other.endswith(" Routeing Point Member"):
                other = other[:-22]
                scraperwiki.sqlite.execute('INSERT INTO groups VALUES (?, ?)',
                    (str(stn), str(other)), verbose=0)
            scraperwiki.sqlite.execute('INSERT INTO routeing_points VALUES (?, ?)',
                (str(stn), str(other)), verbose=0)

    print "Creating indexes"
    scraperwiki.sqlite.execute('CREATE INDEX points_bystn ON routeing_points(station)')
    scraperwiki.sqlite.execute('CREATE INDEX groups_bystn ON groups(station)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Pink pages processed"
def main():
    link_src = \
    '''https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=malaysian_parliament_hansard_url&query=select%20*%20from%20swdata%20limit%2010'''
    links = urllib2.urlopen(link_src)
    links_data = json.load(links)
    pdf_url = links_data[2]['url'].replace(' ','%20')
    print pdf_url
    pdf_data = urllib2.urlopen(pdf_url).read()
    xml_data = scraperwiki.pdftoxml(pdf_data)
    xml_data = xml_data.replace('<b>','').replace('</b>','')
    print xml_data
    root = etree.fromstring(xml_data)
def parse_pdf(url, name, page_url):
    url = url.encode('ascii')
    name = name.encode('utf-8')
    print name
    pdf_url = "http://www.has-sante.fr/portail/" + url
    avis = " "
    avis2 = " "
    
    #follows the first link
    a = scraperwiki.scrape(pdf_url)
    a = a.lower()    

    #finds the actual link (there's a redirect)
    soup = BeautifulSoup(a)
    
    pdf_url = soup.find("meta")

    pdf_url = pdf_url['content']
    pdf_url = pdf_url.replace("0; url='../../../../", "http://www.has-sante.fr/portail/")
    pdf_url = pdf_url[:-1]
    pdf_url = pdf_url.encode('ascii')
    
    #now for the real pdf
    try:
        b = scraperwiki.scrape(pdf_url)
        s = BeautifulSoup(scraperwiki.pdftoxml(b))
    
        #some basic regex to extract meaningful info
        for t in s.findAll('text'):
            if t.text != " ": 
                pattern = '^.*?int.r.t de sant. publique.*?faible.*?$'
                pattern2 = '^.*?service m.dical rendu par.*?$'
                if (re.search(pattern, t.text)):
                    avis = t.text
                    avis = avis.encode('utf-8')
                    print avis
                elif(re.search(pattern2, t.text)):
                    avis2 = t.text
                    avis2 = avis2.encode('utf-8')
                    print avis2
    
        #now we've got everything, we're adding it to the DB
        data = {}
        medoc_name = name
        data['Name'] = medoc_name
        data['pdf_url'] = pdf_url
        data['page_url'] = page_url
        data['interet_sante'] = avis + "\n" + avis2
        data[medoc_name] = medoc_name
        scraperwiki.datastore.save(['Name'], data)

    except: 
        print "Error" + pdf_url
def process_pdf(pdfurl):
# (harder example to work on: http://www.nihe.gov.uk/schemes_accepted_010109_to_310309.pdf )
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    entrylines = []
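    # each range of 'left' coordinates below corresponds to one column of the table;
    # a record is only saved once the rightmost ('rest') column has been seen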
    for text in s.findAll('text'):
        #print text
        entrylines.append(text)
        left = int(text['left'])
        if 82 == left:
            periode = text.text
            innut, m, y = periode.split(" ")
        if 107 == left:
            kapittel = text.text
        if 163 == left:
            post = text.text
        if 704 <= left and left <= 782:
            overfraifjor = text.text
        if 822 <= left and left <= 867:
            bevilgning = text.text
        if 920 <= left and left <= 965:
            samlbevilgning = text.text
        if 1011 <= left and left <= 1056:
            regnskap = text.text
        if 1124 <= left and left <= 1156 and u"1000 kr" != text.text.strip() and post is not None:
            rest = text.text
            if overfraifjor is None or bevilgning is None or samlbevilgning is None or regnskap is None or rest is None:
                error(entrylines, kapittel, post, overfraifjor, bevilgning, samlbevilgning, regnskap, rest)
            data = {
                'periode' : periode,
                'year' : y,
                'month' : m,
                'type' : innut,
                'kapittel' : kapittel,
                'post' : post,
                'overfraifjor' : valstr2int(overfraifjor),
                'bevilgning' : valstr2int(bevilgning),
                'samlbevilgning' : valstr2int(samlbevilgning),
                'regnskap' : valstr2int(regnskap),
                'rest' : valstr2int(rest),
            }
            #print data
            #time.sleep(1)
            scraperwiki.sqlite.save(unique_keys=['periode', 'kapittel', 'post'], data=data)
            post = None
            overfraifjor = None
            bevilgning = None
            samlbevilgning = None
            regnskap = None
            rest = None
            entrylines = []
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    find_link = soup.findAll(href=re.compile("/downloads/pdf/crime_statistics/"))
    next_link = [None]*(len(find_link))
    rep_link = [None]*(len(find_link))
    for i in range(len(find_link)):
        next_link[i] = find_link[i]['href']
        rep_link[i] = next_link[i].replace('../..','http://www.nyc.gov/html/nypd')
    for i in range(len(rep_link)):
        a = scraperwiki.scrape(rep_link[i]) #here I call my previously defined function to convert and scrape the pdf
        soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a))
        scrape_table(soup_pdf)
Example #30
def extract_birth(cvfile):
    '''Extract birth year from PDF file with CV.'''

    pdf = open(CVFILE, 'rb')
    xml = scraperwiki.pdftoxml(pdf.read())

    root = lxml.etree.fromstring(xml)
    birthstr = root.xpath('//text[@top="320"]')[0].text
    
    mf = 'M' if birthstr[3] == 'o' else 'F'
    birthyear = birthstr[-4:]
    
    print(mf, birthyear)
Example #31
def pdf_scrape(pdf, directory):
    '''Convert pdf to xml'''

    with open("pdf/" + directory + "/" + pdf) as u:
        xml = pdftoxml(u.read())

    if not os.path.exists("xml"):
        os.mkdir("xml")

    with open("xml/" + pdf + ".xml", "w") as w:
        w.write(xml)

    return xml
Example #32
def main():
    url="http://governor.ny.gov/citizenconnects/assets/document/CitizenConnectsdoc.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)

    rootdata=lxml.etree.fromstring(xmldata)
    pages = list(rootdata)

#    print "The pages are numbered:", [ page.attrib.get("number")  for page in pages ]

    for page in pages:
        entries = getText(page)
        store(entries)
Example #33
def Main(pdfurl):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div. 
    '''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    try:
        root = lxml.etree.fromstring(pdfxml)
    except lxml.etree.XMLSyntaxError, e:
        print str(e), str(type(e)).replace("<", "&lt;")
        print pdfurl
        print pdfxml.replace("<", "&lt;")
        root = []
Example #34
def scrape_pdf(url):
    '''
    Scrape data from PDF at URL.
    '''
    try:
        pdf_data = urlopen(url).read()
    except:
        return "#MISSING!"

    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    #pages = list(root)
    full_text = get_pdf_text(root)
    return full_text
Example #35
def process_pdf(pdfurl):
    pdfxml = u.findInCache(pdfurl,
                           verbose=True)  # look for html parse in cache
    if pdfxml is None:  # a html parse is not cached
        pdfdata = lazycache.lazycache(
            pdfurl,
            verbose=True)  # look for pdf document in cache, if not download
        pdfxml = scraperwiki.pdftoxml(pdfdata,
                                      "-hidden")  # parse pdf text to html
        u.putInCache(pdfurl, pdfxml, verbose=True)  # save cache of html parse

    beautifulxml = BeautifulSoup(
        pdfxml)  # convert html to BeautifulSoup(4) object

    for page in beautifulxml.find_all('page'):
        FIRSTPAGE = 6
        LASTPAGE = 6
        if int(page['number']) < FIRSTPAGE:
            continue
        if int(page['number']) == FIRSTPAGE:
            print "*******************************************"
            print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE)
            print "*******************************************"
        if int(page['number']) == LASTPAGE + 1:
            print "*******************************************"
            print "****** LASTPAGE #%d while developing ******" % (LASTPAGE)
            print "*******************************************"
            break

        print("*******************************************")
        print("********** Working on page #%s **********" % page['number'])
        print("*******************************************")
        elementList = deque(
            page.find_all('text'))  # we want to be able to use popleft
        d(elementList)
        while True:
            try:
                currElement = elementList.popleft()
                if "Innhold:" in currElement.text and currElement.b:  # we found a "Innhold:"-header
                    entry = parseDocumentRecord(currElement, elementList)
                    print entry
                    scraperwiki.sqlite.save(
                        unique_keys=["innhold", "sakstittel"], data=entry)
                    d("back in process_pdf")
                #else:
                #print currElement.text
            except IndexError, e:
                d("No more text elements on page (%s)" % e)
                break
Example #36
def Main(url):
    tmpfile = tempfile.gettempdir() + "/45_networkrail.zip"
    tmpdir = tempfile.gettempdir() + "/45_networkrail"
    #+str(random.randint(2, 1000000000));
    urllib.urlretrieve(url, tmpfile)

    with zipfile.ZipFile(tmpfile, 'r') as myzip:
        myzip.extractall(tmpdir)

    f = open(tmpdir + "/completeTimetable.pdf", 'r')
    pdfxml = scraperwiki.pdftoxml(f.read())
    #print(os.listdir(tmpdir));
    #print(pdfxml);
    root = lxml.etree.fromstring(pdfxml)
    print '<p>There are %d pages</p>' % len(root)
    print lxml.etree.tostring(root[0])
Example #37
def scrapepdf(url):
    #use the urllib2 library's .urlopen function to open the full PDF URL, and the .read() function to read it into a new object, 'pdfdata'
    pdfdata = urllib2.urlopen(url).read()
    #use pdftoxml to convert that into an xml document
    pdfread = scraperwiki.pdftoxml(pdfdata)
    print pdfread
    #use lxml.etree to convert that into an lxml object
    pdfroot = lxml.etree.fromstring(pdfread)
    #find all <text> tags and put in list variable 'lines'
    lines = pdfroot.findall('.//text')
    #create variable 'linenumber', initialised at 0
    linenumber = 0
    record = {}
    #loop through each item in 'lines' list
    for line in lines:
        #add one to 'linenumber' so we can track which line we're dealing with
        linenumber = linenumber + 1
        #if 'line' has some text:
        if line.text is not None:
            #create a new variable 'mention' that is filled with the result of
            #using the 're' library's .search function
            mention = re.search(r'.*black.*', line.text)
            if mention:
                print line.text
                #the RANGE function generates a list from the first parameter to the second,
                #e.g. range(5,8) would make [5, 6, 7] - it doesn't include the 'end' of the range
                #in this case we're using the line number minus 2, and the linenumber as our start and end points
                print range(linenumber - 2, linenumber + 1)
                linebefore = "EMPTY LINE"
                lineafter = "EMPTY LINE"
                incontextlist = []
                if pdfroot.xpath('.//text')[linenumber - 2].text:
                    linebefore = pdfroot.xpath('.//text')[linenumber - 2].text
                    incontextlist.append(linebefore)
                incontextlist.append(
                    pdfroot.xpath('.//text')[linenumber - 1].text)
                if pdfroot.xpath('.//text')[linenumber].text is not None:
                    lineafter = pdfroot.xpath('.//text')[linenumber].text
                    incontextlist.append(lineafter)
                print "mention.group()", mention.group()
                print "CAN YOU SEE ME?", ''.join(incontextlist)
                record["mention in context"] = ''.join(incontextlist)
                record["linenumber"] = linenumber
                #this stores the 'url' variable which is passed right at the start of this function: def scrapepdf(url):
                record["url"] = url
                print record
                scraperwiki.sqlite.save(["linenumber", "url"], record)
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    find_link = soup.findAll(
        href=re.compile("/downloads/pdf/crime_statistics/"))
    next_link = [None] * (len(find_link))
    rep_link = [None] * (len(find_link))
    for i in range(len(find_link)):
        next_link[i] = find_link[i]['href']
        rep_link[i] = next_link[i].replace('../..',
                                           'http://www.nyc.gov/html/nypd')
    for i in range(len(rep_link)):
        a = scraperwiki.scrape(
            rep_link[i]
        )  #here I call my previously defined function to convert and scrape the pdf
        soup_pdf = BeautifulSoup(scraperwiki.pdftoxml(a))
        scrape_table(soup_pdf)
Example #39
def ExtractPdf(year, nz, pdfbin, lurl):
    mnz = re.match("(...).*?(?:\d\d)?(\d\d)?_3.pdf", nz)
    assert mnz, nz
    assert mnz.group(1).lower() in m3, nz
    dnz = "%d-%02d" % (mnz.group(2) and int(mnz.group(2)) + 2000
                       or int(year), m3.index(mnz.group(1).lower()) + 1)
    #print "date", dnz
    root = lxml.etree.fromstring(scraperwiki.pdftoxml(pdfbin))
    currentcountry = None
    currentmission = None
    ldata = []
    data = None
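    # layout-based parsing: a cell's 'left' coordinate indicates whether it is a country
    # name (roughly 130-140), a mission heading (276-280) or body text (right of 350)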
    for page in list(root):
        rtblocks = []
        #print lxml.etree.tostring(page)
        for text in page:
            if text.tag != "text":
                continue

            if 130 <= int(text.attrib.get("left")) <= 140:
                #print lxml.etree.tostring(text)
                currentmission = None
                currentcountry = text_content(text).strip()
            if 276 <= int(text.attrib.get("left")) <= 280:
                if rtblocks and data:
                    lndata = parsemissionblock(rtblocks, data)
                    ldata.extend(lndata)
                currentmission = text_content(text).strip()
                data = {
                    "link": lurl,
                    "nz": nz,
                    "month": dnz,
                    "country": currentcountry,
                    "mission": currentmission,
                    "year": year
                }
                rtblocks = []
            if int(text.attrib.get("left")) > 350:
                rtblocks.append(text)

        if rtblocks and data:
            lndata = parsemissionblock(rtblocks, data)
            ldata.extend(lndata)
    scraperwiki.sqlite.save(["month", "country", "mission", "desc"], ldata)
    return dnz, len(ldata)
Example #40
def scrape_pdf(url):
    '''
    Scrape data from PDF at URL.
    '''
    try:
        pdf_data = urlopen(url).read()
    except:
        return (None, None, None)

    pdf_xml = scraperwiki.pdftoxml(pdf_data)
    root = lxml.etree.fromstring(pdf_xml)
    page0 = root.find('page')
    try:
        content = dict(parse(list(tokenize(page0))))
    except ParseError:
        content = None
    full_text = get_pdf_text(root)
    return pdf_xml, full_text, content
def scrapepdf(pdfurl):
    #print "scraping " + pdfurl
    pdfdata = urllib.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    s = BeautifulSoup(pdfxml)
    #print s

    casenr = None
    datestr = None
    daynr = None
    last_line = ""
    for idx, text in enumerate(s.findAll('text')):
        msg = text.text
        #print msg
        if 0 == msg.find(u"Møte "):
            datestr = datestr2date(msg.split("den ")[1].split(" kl.")[0])
            #print datestr
        if 0 == msg.find("D a g s o r d e n (nr."):
            daynr = msg.split(")")[0].split(".")[1]
            continue
        if -1 != msg.find("Votering i sak nr."):
            #print msg
            casenr = msg.split("nr.")[1].strip()
            continue
        elif -1 != msg.find("Votering i sak "):
            #print msg
            casenr = msg.split("i sak ")[1].strip()
            continue
        if -1 != msg.find("enstemmig bifalt") or -1 != msg.find(
                "enstemmig vedtatt") or (
                    (-1 != msg.find("bifalt") or -1 != msg.find("vedtatt"))
                    and -1 != last_line.find("ble enstemmig")):
            #print datestr, daynr, casenr, msg
            data = {
                'index': idx,
                'date': datestr,
                'daynr': daynr,
                'casenum': casenr,
                'msg': last_line + msg,
            }
            if casenr is not None:
                scraperwiki.sqlite.save(
                    unique_keys=['date', 'casenum', 'index'], data=data)
        last_line = msg
def maps():
    url = "http://www.atoc.org/clientfiles/File/Maps.pdf"

    print "Fetching maps"
    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)


    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    #print "The first 20000 characters are: ", xmldata[:20000]

    print "Converting PDF to PNGs"
    with tempfile.NamedTemporaryFile() as pdffile:
        pdffile.write(pdfdata)
        pdffile.flush()
        tmpdir = tempfile.mkdtemp()

        subprocess.check_call(['pdftoppm', '-r', '75', '-png',
            pdffile.name, os.path.join(tmpdir, 'p')])

    print "Parsing XML"
    root = lxml.etree.fromstring(xmldata)

    print "Processing maps"
    maptitles = root.xpath('//text[@height=100]')

    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS maps')
    scraperwiki.sqlite.execute('CREATE TABLE maps (mapname, pageno, data)')

    for maptitle in maptitles:
        pageno = int(maptitle.xpath('string(../@number)'))
        with open(os.path.join(tmpdir, 'p-%03d.png' % (pageno)), 'rb') as f:
            scraperwiki.sqlite.execute('INSERT INTO maps VALUES (?,?,?)',
                (maptitle.xpath('string()'), pageno, base64.b64encode(f.read())))

    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX maps_bymap ON maps(mapname, pageno)')
    print "Committing maps"
    scraperwiki.sqlite.commit()
    print "Maps processed"
def yellow_pages():
    #
    # Yellow pages
    #
    # This file is huge, so we do the XML parsing incrementally.
    #

    print "Loading yellow pages (permitted route list)"
    url = "http://www.atoc.org/clientfiles/File/permitted_route_identifier.pdf"

    pdfdata = urllib2.urlopen(url).read()
    print "The pdf file has %d bytes" % len(pdfdata)

    print "Converting to XML"
    xmldata = scraperwiki.pdftoxml(pdfdata)
    # print "After converting to xml it has %d bytes" % len(xmldata)
    # print "The first 20000 characters are: ", xmldata[:20000]

    orig = None
    dest = None

    scraperwiki.sqlite.execute('DROP TABLE IF EXISTS permitted_routes')
    scraperwiki.sqlite.execute('CREATE TABLE permitted_routes (orig, dest, maps)')

    print "Processing XML"
    # This is horrid, and assumes that the PDF will be in the correct order.
    for _, cell in lxml.etree.iterparse(StringIO.StringIO(xmldata), tag='text'):
        if cell.attrib['height'] == '10':
            if cell.attrib['left'] == '80':
                orig = cell.xpath('string()')
            elif cell.attrib['left'] == '208':
                dest = cell.xpath('string()')
            else:
                scraperwiki.sqlite.execute('INSERT INTO permitted_routes VALUES (?, ?, ?)',
                    (orig, dest, cell.xpath('string()')))
        cell.clear()

    print "Creating index"
    scraperwiki.sqlite.execute('CREATE INDEX routes_bystn ON permitted_routes(orig, dest)')
    print "Committing"
    scraperwiki.sqlite.commit()
    print "Yellow pages done"
def process_pdf( url ):
    print "PROCESSING: " , url, 
    pdfdata = urllib2.urlopen(url).read()
    print len( pdfdata ), "bytes"
    if len(pdfdata) > 50000:
        return "" #too BIG Daddio!

    str = ''
    xmldata = scraperwiki.pdftoxml(pdfdata)

    root = lxml.etree.fromstring(xmldata)
    pages = list(root)


    def gettext_with_bi_tags(el):
        res = [ ]
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
            if lel.tail:
                res.append(lel.tail)
        return "".join(res)

    for page in pages :
        print page.attrib.get("number")
        # print the first hundred text elements from the first page
        page0 = pages[0]
        i = []
        data = []
        for el in list(page)[:1000]:
            if el.tag == "text":
                 data = {}
                 text = strip_tags( gettext_with_bi_tags(el) )
                 #data['text'] =  text
                 #data['url'] = url # The source of these words 
                 if text != '' and text != ' ':
                     #scraperwiki.sqlite.save(i, data)
                     str += " " + text
    return str                     
Example #45
def getheadingsfrompdf(pdfurl):
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)

    ldata = []
    for page in root:
        for el in page:
            # needs also to do concatenation between headings that run to two lines,
            # and handle headings with italics in them <i>
            if el.tag == "text" and el.attrib.get("font") == "10" and len(
                    el) == 1 and el[0].tag == "b":
                data = {
                    "pdfurl": pdfurl,
                    "pagenumber": int(page.attrib.get("number")),
                    "heading": el[0].text
                }
                ldata.append(data)
    scraperwiki.sqlite.save(["pdfurl", "pagenumber", "heading"], ldata,
                            "subheadings")
Example #46
def scrape():
    u = file("cho-1-elementary.pdf")

    x = scraperwiki.pdftoxml(u.read())
    soup = BeautifulSoup(x)
    book = soup.get_text().split('\n')

    page = []
    newpage = []
    for x in book[36:]:
        newpage += [x]
        if x == '':
            pass
        elif x[0] == 'p':
            page += [newpage]

            newpage = []
        elif x[0] == '<':
            newpage = [x]
    return page
Example #47
def iter_areas():
    import scraperwiki
    import StringIO

    pdfurl = "http://www.appc.org.uk/appc/filemanager/root/site_assets/pdfs/appc_register_entry_for_1_december_2009_to_28_february_2010.pdf"
    pdf = scraperwiki.scrape(pdfurl)
    print "Converting pdf to xml"
    xml = scraperwiki.pdftoxml(pdf)
    print "got xml"
    xmlfd = StringIO.StringIO(xml)
    doc = PdfToHTMLOutputParser(xmlfd)
    print "got doc"

    #import sys
    #doc = PdfToHTMLOutputParser(open(sys.argv[1]))

    org = {}
    grouper = TextGrouper()
    grouper.add_patterns(
        (re.compile("APPC register entry ", re.IGNORECASE), "dates"),
        ("Address(es) in UK", "address"),
        ("Address in UK", "address"),
        ("Contact", "contact"),
        ("Offices outside UK", "section"),
        (re.compile("providing PA consultancy services",
                    re.IGNORECASE), "section"),
        (re.compile("clients for whom", re.IGNORECASE), "section"),
    )

    def font_0(item):
        if item.fontspec.number == 0:
            item.props['type'] = 'name'
            item.props['grabbottom'] = 20
            print "Marked title:", repr(item.text)

    grouper.special_fns.append(font_0)
    grouper.group(doc.text(merge_verticals=True))
    #grouper.display()
    #grouper.display_full()
    for area in grouper.areas:
        yield area
def fetch_record(url):
    f = requests.get(url)
    pdf = scraperwiki.pdftoxml(f.content)
    root = lxml.etree.fromstring(pdf)
    texts = root.xpath("//text")
    rows = {}
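    # group the text cells into rows by rounding each 'top' coordinate down to the
    # nearest 10 pixels, so slightly misaligned cells still land in the same row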
    for text in texts:
        top = int(text.xpath("./@top")[0]) / 10 * 10
        left = int(text.xpath("./@left")[0])
        value = text.text.strip()
        if top not in rows:
            rows[top] = []
        rows[top].append(value)
    rows_sorted = [rows[key] for key in sorted(rows.keys())]
    first_row = rows_sorted[0][0]
    words = first_row.split(" ")
    month = list(calendar.month_name).index(words[-2])
    if month == 0:
        raise Exception("Cannot parse month")
    year = int(words[-1])

    rows_sorted = rows_sorted[1:-1]
    header = [convert(s) for s in rows_sorted[0]]
    num_rows = len(header)
    for i in range(1, len(rows_sorted)):
        row = rows_sorted[i]
        k = num_rows - len(row)
        if k > 0:
            padding = [""] * k
            rows_sorted[i] = padding + rows_sorted[i]
    for i in range(1, len(rows_sorted)):
        if len(rows_sorted[i][0]) == 0:
            rows_sorted[i][0] = rows_sorted[i - 1][0]

        d = {"year": year, "month": month}
        for j in range(0, len(header)):
            d[header[j]] = rows_sorted[i][j]
        scraperwiki.sqlite.save(unique_keys=['year', 'month', 'district'],
                                data=d)
        print d
Example #49
def scrape_cieisp(year, text):
    if (year == 2010):
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/INCIDENCIA_DELICTIVA_2010_030211.pdf"
    else:
        pdfurl = "http://www.secretariadoejecutivosnsp.gob.mx/work/models/SecretariadoEjecutivo/Resource/131/1/images/CIEISP" + ` year ` + ".pdf"
    a = scraperwiki.scrape(pdfurl)
    s = BeautifulSoup(scraperwiki.pdftoxml(a))

    dolosos_position = []
    i = 0
    for t in s.findAll('text'):
        if t.text == "DOLOSOS":
            if text == "POR ARMA DE FUEGO":
                dolosos_position.append(i + 14)
            else:
                dolosos_position.append(i)
        i += 1

    all_text = s.findAll('text')
    #print all_text

    if (year <= 2008):
        if (year >= 2006):
            states_names = states3
        else:
            states_names = states2
    else:
        states_names = states

    for i in range(0, 33):
        for j in range(1, 14):
            record = {
                'State': states_names[i],
                'Year': year,
                'Month': months[j - 1],
                'Homicides': all_text[dolosos_position[i] + j].text,
                'Crimetype': text
            }
            scraperwiki.datastore.save(["State", "Year", "Month"], record)
    return
    def get_id_period (self, date):

        from_iso_dt, to_iso_dt = util.inc_dt(date.strftime(util.ISO8601_DATE), util.ISO8601_DATE, self.PERIOD_TYPE)
        from_dt = util.get_dt(from_iso_dt, util.ISO8601_DATE)
        to_dt = util.get_dt(to_iso_dt, util.ISO8601_DATE)

        url_date = to_dt.strftime(self.search_url)
        if self.DEBUG: print url_date
        try:
            response = self.br.open(url_date)
        except:
            response = None

        final_result = []
        if response:
            pdfxml = scraperwiki.pdftoxml(response.read())
            if self.DEBUG: print pdfxml
            url = response.geturl()
            result = scrapemark.scrape(self.scrape_ids1, pdfxml, url)
            if not result or not result.get('records'):
                result = scrapemark.scrape(self.scrape_ids2, pdfxml, url)
            if result and result.get('records'):
                for rec in result['records']:
                    rec['url'] = url_date
                    rec['start_date'] = rec['date_received']
                    try:
                        map_ref_list = rec['os_map_ref'].split()
                        rec['easting'] = map_ref_list[0]
                        rec['northing'] = map_ref_list[1]
                        del rec['os_map_ref']
                    except:
                        pass
                self.clean_ids(result['records'])
                for rec in result['records']: # note do this after record cleaning
                    rec['date_scraped'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
                final_result.extend(result['records'])
        #else:
        #    return [], None, None

        return final_result, from_dt, to_dt # note weekly result might some times be legitimately empty
def Main(pdfurl):
    '''
    Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents
    as a styled HTML div. 
    '''
    pdfdata = urllib2.urlopen(pdfurl).read()
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(pdfxml)

    global styles
    fontspecs = {}

    # Get the PDF's internal styles: we'll use these to style the divs containing the PDF.
    for fontspec in root.xpath('page/fontspec'):
        id = fontspec.attrib.get('id')
        fontdesc = {
            'size': int(fontspec.attrib.get('size')),
            'family': fontspec.attrib.get('family'),
            'color': fontspec.attrib.get('color')
        }
        fontspecs[id] = fontdesc
        styles['div.fontspec-%s' %
               id] = 'color:%s;font-family:%s;font-size:%dpx' % (
                   fontdesc['color'], fontdesc['family'], fontdesc['size'])

    # Output the view, with instructions for the user.
    print '<html dir="ltr" lang="en">'
    print '<head>'
    print '    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    print '    <title>PDF to XML text positioning</title>'
    print '    <style type="text/css" media="screen">%s</style>' % "\n".join(
        ["%s { %s }" % (k, v) for k, v in styles.items()])
    print '    <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>'
    print '    <script>%s</script>' % jscript
    print '</head>'

    # Print each page of the PDF.
    for index, page in enumerate(root):
        print Pageblock(page, index)
Example #52
def scrapereport(reportlink):
    boldline = 0
    html = scraperwiki.scrape(baseurl + reportlink)
    root = lxml.html.fromstring(html)
    links = root.cssselect("div#unusefulbottom a")
    #<div id="unusefulbottom">
    for link in links:
        print "LINK GRABBED WITH CSSSELECT", link
        print "link.attrib.get", link.attrib.get('href')
        downloadlink = link.attrib.get('href')
        #    print " downloadlink[0].text_content()", downloadlink[0].text_content()
        pdfdata = urllib2.urlopen(baseurl + downloadlink).read()
        print "pdfdata", pdfdata
        xmldata = scraperwiki.pdftoxml(pdfdata)
        print "xmldata", xmldata
        pdfxml = lxml.etree.fromstring(xmldata)
        print "pdfxml", pdfxml
        boldtags = pdfxml.xpath('.//text')
        linenumber = 0
        for heading in boldtags:
            linenumber = linenumber + 1
            #print "Heading:", heading.text
            if heading.text is not None:
                #                mention = re.match(r'.*NMS.*',heading.text)
                mention = re.match(r'.*overall.*', heading.text)
                if mention:
                    print "FULL LINE", lxml.etree.tostring(heading,
                                                           encoding="unicode",
                                                           method="text")
                    #                    print "OVERALL", heading.text
                    #                    print "CHECK", pdfxml.xpath('.//text')[linenumber-1].text
                    #                    print "LINEAFTER", pdfxml.xpath('.//text')[linenumber].text
                    record['overall'] = lxml.etree.tostring(heading,
                                                            encoding="unicode",
                                                            method="text")
                    boldline = boldline + 1  # count matches so uniqueref is unique within the report
                    record['uniqueref'] = reportlink + "_" + str(boldline)
                    record['downloadlink'] = baseurl + downloadlink
                    scraperwiki.sqlite.save(['uniqueref'], record)
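The commented-out CHECK/LINEAFTER prints above peek at neighbouring <text> nodes by index; a self-contained sketch of that lookup against a made-up pdftoxml fragment:

import re
import lxml.etree

sample = ('<pdf2xml><page number="1">'
          '<text>Summary of findings</text>'
          '<text>The overall effectiveness of the setting is good</text>'
          '<text>Capacity for sustained improvement</text>'
          '</page></pdf2xml>')

lines = lxml.etree.fromstring(sample).xpath('.//text')
for i, line in enumerate(lines):
    if line.text and re.search(r'overall', line.text):
        print "MATCH    ", line.text
        if i + 1 < len(lines):
            print "LINEAFTER", lines[i + 1].text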
def carregaPagina(url):
    print url
    pdfdata = urllib2.urlopen(url).read()
    #print "The pdf file has %d bytes" % len(pdfdata)
    
    xmldata = scraperwiki.pdftoxml(pdfdata)
    #print "After converting to xml it has %d bytes" % len(xmldata)
    print "The first 5000 characters are: ", xmldata[:5000]
    
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    print "The pages are numbered:", [ page.attrib.get("number")  for page in pages ]

    arquivo = url.replace("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/Arquivos/", "")
    cabecalho = True

    for page in pages: #[:1]
        data = {}
        conta = 0
        coluna = 0
        for el in list(page)[:110]: #[:100]
            if el.tag == "text":
                data[colunas[coluna]] = el.text.strip()
                coluna = coluna + 1
                if coluna >= len(colunas):
                    if not cabecalho:
                        data['arquivo'] = arquivo
                        #print el.attrib['left'], el.text
                        if conta < 10: print data
                        scraperwiki.datastore.save(["arquivo", "nome", "cargo", "municipio"], data)
                        conta = conta + 1
                        data = {}

                    cabecalho = False
                    coluna = 0

        print "Pagina %s: %s registro(s)" % (page.attrib.get("number"), conta)
Exemple #54
0
def main():
    url="http://www.freedomhouse.org/sites/default/files/Freedom%20OnThe%20Net_Full%20Report.pdf"
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    
    goodpages=[27,28,29]

    rootdata=lxml.etree.fromstring(xmldata)
    pages = list(rootdata)

    page=pages[23]
    alltext=getText(page)
    dict1(alltext)
    for i in goodpages:
        page=pages[i]
        alltext=getText(page)
        dict2(alltext)

    # Country profile pages and the country each one covers.
    pagenumbers=[30,36,47,53, 57, 61, 65, 72, 78, 82, 87, 97, 102, 108]
    Country=['Brazil', 'China', 'Cuba', 'Egypt', 'Estonia', 'Georgia', 'India', 'Iran', 'Kenya', 'Malaysia','Russia','Tunisia', 'Turkey', 'UK']
    for i in range(len(pagenumbers)):
        page=pages[pagenumbers[i]]
        alltext=getText(page)
        PageInfo(alltext, Country[i])
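getText(), dict1(), dict2() and PageInfo() are defined elsewhere in this scraper; a plausible getText()-style helper for a pdftoxml page element (an assumption, not the original code) might be:

def gettext_sketch(page):
    # Join the string content of every <text> element on one pdftoxml page.
    return "\n".join(t.xpath('string()') for t in page.findall('text'))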
import scraperwiki
import urllib2
import lxml.etree
import bs4

url = "http://dget.nic.in/ItiUpgradePPP/list%20of%20%20ITIs%20only%20wth%20industry%20partners10-11.pdf"
pdfdata = urllib2.urlopen(url).read()
#print "The pdf file has %d bytes" % len(pdfdata)

xmldata = scraperwiki.pdftoxml(pdfdata)
#print "After converting to xml it has %d bytes" % len(xmldata)
#print "The first 2000 characters are: ", xmldata[:2000]

root = lxml.etree.fromstring(xmldata)
print xmldata
soup = bs4.BeautifulSoup(xmldata)
#print soup
start = False
ITI = True
sl_no = 0
for link in soup.find_all('text'):
    #print link.textcontent()
    #print link.get_text()
    #print str(start)
    text = link.get_text()
    text = text.replace(',', ' ')
    if start:
        if len(text) > 4 or text.count('NIL') > 0:
            #print text
            #if text.count('(ITI-')>0:
            #continue
Exemple #56
0
    geocode = simplejson.loads(geo_response.read())
    print geocode_url
    print geocode
    #Google imposes query limits, this lets us pass a failure and have the loop sleep and try again after 2 seconds
    if geocode['status'] == "OVER_QUERY_LIMIT":
        return 0
    if geocode['status'] != 'ZERO_RESULTS':
        coord_lat = geocode['results'][0]['geometry']['location']['lat']
        coord_lon = geocode['results'][0]['geometry']['location']['lng']
        coord.append(coord_lat)
        coord.append(coord_lon)
    print coord
    return coord
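The comment above notes that the calling loop sleeps and retries when Google returns OVER_QUERY_LIMIT; a hedged sketch of such a loop (geocode_address is a stand-in name for the truncated function above):

import time

def geocode_with_retry(address, attempts=5):
    # Retry while the (hypothetical) geocode_address() signals OVER_QUERY_LIMIT
    # by returning 0, sleeping 2 seconds between attempts.
    for _ in range(attempts):
        coord = geocode_address(address)
        if coord != 0:
            return coord
        time.sleep(2)
    return []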

url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"
xml = scraperwiki.pdftoxml(urllib2.urlopen(url).read())
parsed = BeautifulSoup(xml).text.split("\n")
filtered_list = parsed[parsed.index('Location: '):]
closures = []

i = 0
current_closure = -1

while i < len(filtered_list):
    text = filtered_list[i]
    if text == "Location: ":
        closures.append({})
        current_closure = len(closures) - 1
        i += 1
        closures[current_closure]['location'] = filtered_list[i]
        #print filtered_list[i]
# Blank Python
import sys
import scraperwiki
import urllib
import lxml.etree, lxml.html
import re

# for the geocode
from geopy import geocoders

import json

pdfurl = "http://www.nikebiz.com/responsibility/documents/factory_disclosure_list.pdf"

pdfdata = urllib.urlopen(pdfurl).read()
pdfxml = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(pdfxml)

g = geocoders.Google(
    'ABQIAAAAJWpc-texCflE7mMP0dgMGRTudD1_fegkcYIvU14JimqYoyT2khRxYTlCvIBPJApaoqvk4JfEfbrhyg'
)

for page in root:
    assert page.tag == 'page'
    #print "page details", page.attrib
    pagelines = {}
    pagedata = {}
    for v in page:
        if v.tag == 'text':
            text = re.match(
                '(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(v)).group(
Exemple #58
0
from scraperwiki import pdftoxml
from urllib2 import urlopen
from lxml.html import fromstring, tostring
import lxml.etree

def get_pdf_list():
    raw = urlopen('http://www.dropbox.com/sh/gpi0ejooop07x8a/bMDz4s9Ixp').read()
    html = fromstring(raw)
    
    a_elements = html.cssselect('li.browse-file.list-view-cols div.filename-col a')
    pdf_urls = [a.attrib['href'] + '?dl=1' for a in a_elements]
    return pdf_urls

test_url = 'https://www.dropbox.com/sh/gpi0ejooop07x8a/qJxWkjx8fz/ENDA-HR1858-June1997.pdf?dl=1'

raw_pdf = urlopen(test_url).read()
pdfxml = lxml.etree.fromstring(pdftoxml(raw_pdf))

rawtext = pdfxml.xpath('string()').replace('\n', ' ')
print rawtext

def get_root(filename):
    # Read a PDF from disk and parse its pdftoxml output into an lxml tree.
    f = open(filename, "rb")
    return fromstring(pdftoxml(f.read()))
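A hedged usage sketch, not part of the original scraper: fetch each Dropbox PDF found by get_pdf_list() and report how much text pdftoxml extracts from it.

for pdf_url in get_pdf_list():
    xml = pdftoxml(urlopen(pdf_url).read())
    flat = lxml.etree.fromstring(xml).xpath('string()').replace('\n', ' ')
    print pdf_url, "->", len(flat), "characters"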
Exemple #60
0
import scraperwiki
from BeautifulSoup import BeautifulSoup
import time
import urllib

urltemplate = 'http://www.fco.gov.uk/resources/en/protocol/ldl-'
date = time.strftime('%A %d %B %Y')
month = (date.split(' '))[2]
year = (date.split(' '))[3]

#url = urltemplate + month + year
url = 'http://www.fco.gov.uk/resources/en/protocol/ldl-August2010'
pdfinput = urllib.urlopen(url)
print 'got pdf'
scraped = scraperwiki.pdftoxml(pdfinput.read())
print 'pdftohtml complete'
output = []


def getlastrow():
    # Return the most recently appended row; indexing with len(output) was always out of range.
    return output[-1]


print 'finished setup'
soup = BeautifulSoup(scraped)
print 'soup cooked'
# this document is a right dog
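The rest of this scraper is not shown here; a minimal hedged sketch of how the cooked soup might be walked to collect candidate rows (purely illustrative, not the author's code):

for tag in soup.findAll('text'):
    row = ''.join(tag.findAll(text=True)).strip()
    if row:
        output.append(row)

print 'collected %d candidate rows' % len(output)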