def scrapeData(self, rawtext):
    """Does the gritty data extraction from the HTML;
    returns a dictionary of what it found."""
    scrapeddata = {}
    pagetext = '<html>' + rawtext[rawtext.find('output.html - end'):]
    soup = BeautifulSoup(pagetext)
    marker = soup.find(hastext('b', r'(?m)No.\W+de\W+Ficha')) or nosoup
    scrapeddata['nodeficha'] = tagtext(marker.findNext('p'))
    scrapeddata['nodocumento'] = tagtext(
        soup.find(hastext('b', r'(?m)No.\W+Documento')).findNext('p'))
    scrapeddata['nombredelasociedad'] = tagtext(
        soup.find(hastext(
            'b', '(?m)Nombre de la Funda')).findNext('table')).strip()
    #scrapeddata['Tomo'] = smallHead(soup, 'Tomo:')
    #scrapeddata['Folio'] = smallHead(soup, 'Folio:')
    #scrapeddata['Asiento'] = smallHead(soup, 'Asiento:')
    scrapeddata['registerdate'] = self.smallHead(soup, 'Fecha de Registro:')
    scrapeddata['agent'] = self.smallHead(soup, 'Agente Residente')
    headings = (
        # basics
        'Fecha de Registro',    # date registered
        'Status',               # status
        # apostilled document (???)
        'No. de Escritura',     # signed document number
        'Fecha de Escritura',   # date of the signed documents
        # registration details
        'Notaria',              # may be odd
        'Provincia Notaria',    # notary province
        'Duraci.n',             # duration
        'Domicilio',            # domicile
        # capital
        'Moneda',               # currency
        'Monto de Capital',     # amount of capital
        # tax details
        'Fecha de Pago',        # date the company tax was paid
        'Agente Residente',     # resident agent
        #'Status de la Prenda',
    )
    #for heading in headings:
    #    scrapeddata[heading] = smallHead(soup, heading)
    #scrapeddata['representantelegal'] = tagtext(soup.find(hastext(
    #    'font', '(?m)Representante Legal')).findNext('table')).strip()
    scrapeddata['titles'] = self.dictOfTitles(soup)
    scrapeddata['directors'] = self.listOfDirectors(soup)
    scrapeddata['suscriptores'] = self.listFromTable(
        soup, 'Nombre de los Suscriptores')
    #scrapeddata['capital'] = self.listFromTable(soup, 'Capital')
    for item in scrapeddata:
        if not isinstance(scrapeddata[item], unicode):
            print(scrapeddata[item])
    return scrapeddata
def dictOfTitles(self, soup):
    # The "Titulo del Dignatario" table maps each office (title) to the
    # name of the person holding it.
    table = soup.find(hastext('font', 'T.tulo del Dignatario')).findNext('table')
    titledict = {}
    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if len(cells) != 2:
            print('unexpected table layout: ' + row.prettify())
            continue
        title = tagtext(cells[0])
        name = tagtext(cells[1])
        if name and title:
            titledict[title] = name
    return titledict
def scrapeData(self, rawtext):
    """Does the gritty data extraction from the HTML;
    returns a dictionary of what it found."""
    scrapeddata = {}
    pagetext = '<html>' + rawtext[rawtext.find('output.html - end'):]
    soup = BeautifulSoup(pagetext)
    marker = soup.find(hastext('b', r'(?m)No.\W+de\W+Ficha')) or nosoup
    scrapeddata['nodeficha'] = tagtext(marker.findNext('p'))
    scrapeddata['nodocumento'] = tagtext(
        soup.find(hastext('b', r'(?m)No.\W+Documento')).findNext('p'))
    scrapeddata['nombredelasociedad'] = tagtext(
        soup.find(hastext(
            'b', '(?m)Nombre de la Sociedad')).findNext('table')).strip()
    #scrapeddata['Tomo'] = smallHead(soup, 'Tomo:')
    #scrapeddata['Folio'] = smallHead(soup, 'Folio:')
    #scrapeddata['Asiento'] = smallHead(soup, 'Asiento:')
    scrapeddata['registerdate'] = self.smallHead(soup, 'Fecha de Registro:')
    scrapeddata['agent'] = self.smallHead(soup, 'Agente Residente')
    headings = (
        'Fecha de Registro',
        'Status',
        'No. de Escritura',
        'Notaria',              # may be odd
        'Provincia Notaria',
        'Duración',
        'Domicilio',
        #'Status de la Prenda',
    )
    for heading in headings:
        scrapeddata[heading] = self.smallHead(soup, heading)
    scrapeddata['status_de_la_prenda'] = self.wideHead(soup, 'Prenda')
    scrapeddata['titles'] = self.dictOfTitles(soup)
    scrapeddata['directors'] = self.listOfDirectors(soup)
    scrapeddata['suscriptores'] = self.listFromTable(
        soup, 'Nombre de los Suscriptores')
    # we don't really care about the boilerplate explanations of share
    # distribution
    #scrapeddata['capital'] = self.listFromTable(soup, 'Capital')
    #scrapeddata['representantelegal'] = tagtext(soup.find(hastext(
    #    'font', '(?m)Representante Legal')).findNext('table')).strip()
    for item in scrapeddata:
        if not isinstance(scrapeddata[item], unicode):
            print(scrapeddata[item])
    return scrapeddata
def listFromTable(self, soup, term):
    table = soup.find(hastext('font', term)).findNext('table')
    directors = []
    for row in table.findAll('tr'):
        text = tagtext(row).strip()
        if text:
            directors.append(text)
    return directors
def wideHead(self, soup, term):
    # Find the heading matching 'term' and read the value from the
    # 49%-wide table cell that follows it; fall back to BADDATA if empty.
    marker = soup.find(hastext('font', term)) or nosoup
    value = tagtext(marker.findNext('td', width='49%')).strip()
    return value or BADDATA
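# Illustrative sketch only, not part of the scraper: a minimal, standalone
# demonstration of the "find a heading tag by regex, then read the element
# that follows it" pattern used throughout scrapeData() above.  The function
# name, the sample HTML, and the inline lambda are assumptions for the demo;
# the real hastext()/tagtext() helpers are defined elsewhere in this file and
# may differ.
def _demoFindNextPattern():
    # Uses bs4 for the demo; the scraper's own BeautifulSoup import may be
    # the older BeautifulSoup 3, whose find()/findNext() calls look the same.
    import re
    from bs4 import BeautifulSoup as BS
    sample = '<html><b>No. de Ficha</b><p> 12345 </p></html>'
    demosoup = BS(sample, 'html.parser')
    # Roughly what soup.find(hastext('b', r'No.\W+de\W+Ficha')) does above:
    heading = demosoup.find(
        lambda tag: tag.name == 'b'
        and re.search(r'(?m)No.\W+de\W+Ficha', tag.get_text()))
    # Roughly what tagtext(heading.findNext('p')) plus strip() does above:
    return heading.findNext('p').get_text().strip()   # -> '12345'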