Example #1
    def scrapeData(self, rawtext):
        """
        Does the gritty data extraction from HTML
        returns a dictionary of what it found
        """
        scrapeddata = {}
        # keep only the chunk of the dump from the 'output.html - end' marker onwards
        pagetext = '<html>' + rawtext[rawtext.find('output.html - end'):]
        soup = BeautifulSoup(pagetext)
        # fall back to the nosoup dummy if the heading is missing
        marker = soup.find(hastext('b', r'(?m)No.\W+de\W+Ficha')) or nosoup
        scrapeddata['nodeficha'] = tagtext(marker.findNext('p'))
        scrapeddata['nodocumento'] = tagtext(
            soup.find(hastext('b', r'(?m)No.\W+Documento')).findNext('p'))
        scrapeddata['nombredelasociedad'] = tagtext(
            soup.find(hastext(
                'b', '(?m)Nombre de la Funda')).findNext('table')).strip()
        #scrapeddata['Tomo'] = smallHead(soup, 'Tomo:')
        #scrapeddata['Folio'] = smallHead(soup, 'Folio:')
        #scrapeddata['Asiento'] = smallHead(soup, 'Asiento:')
        scrapeddata['registerdate'] = self.smallHead(soup,
                                                     'Fecha de Registro:')
        scrapeddata['agent'] = self.smallHead(soup, 'Agente Residente')
        headings = (

            #basics
            'Fecha de Registro',  #date registered
            'Status',  #status

            # Apostilled document (???)
            'No. de Escritura',  # signed document number
            'Fecha de Escritura',  # date of signed documents

            # registration details
            'Notaria',  #may be odd
            'Provincia Notaria',  #notary province
            'Duraci.n',  #duration
            'Domicilio',  #domicile

            # Capital
            'Moneda',  # currency
            'Monto de Capital',  # amount of capital

            #tax details
            'Fecha de Pago',  # date paid company tax
            'Agente Residente',  #resident agent

            #'Status de la Prenda',
        )
        #for heading in headings:
        #    scrapeddata[heading] = smallHead(soup, heading)
        #scrapeddata['representantelegal'] = tagtext(soup.find(hastext('font', '(?m)Representante Legal')).findNext('table')).strip()
        scrapeddata['titles'] = self.dictOfTitles(soup)
        scrapeddata['directors'] = self.listOfDirectors(soup)
        scrapeddata['suscriptores'] = self.listFromTable(
            soup, 'Nombre de los Suscriptores')
        #scrapeddata['capital'] = self.listFromTable(soup, 'Capital')
        # flag any scraped value that is not a unicode string (Python 2)
        for item in scrapeddata:
            if not isinstance(scrapeddata[item], unicode):
                print(scrapeddata[item])
        return scrapeddata
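The methods in these examples call several module-level helpers that are never shown on this page: hastext, tagtext and nosoup (the code is also Python 2, hence the unicode check above). The names and call signatures come from the code itself, but the bodies below are only a sketch of what they might plausibly look like, written against bs4:

import re

from bs4 import BeautifulSoup  # the originals may well use the older BeautifulSoup 3 API


def hastext(name, pattern):
    """Build a matcher for soup.find(): True for tags with the given name
    whose text matches the regular expression pattern."""
    def matcher(tag):
        return tag.name == name and re.search(pattern, tag.get_text()) is not None
    return matcher


def tagtext(tag):
    """Collapse a tag's text into a single whitespace-normalised string."""
    return ' '.join(tag.get_text().split())


# Fallback for `soup.find(...) or nosoup`: when the marker is missing,
# findNext() yields an empty <p> instead of raising AttributeError on None.
_EMPTY = BeautifulSoup('<p></p>', 'html.parser')


class _NoSoup(object):
    def findNext(self, *args, **kwargs):
        return _EMPTY.p


nosoup = _NoSoup()

With something along these lines in place, soup.find(hastext('b', r'(?m)No.\W+de\W+Ficha')) picks out the bold "No. de Ficha" heading, and findNext('p') reads the value that follows it.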
Example #4
    def dictOfTitles(self, soup):
        # the '.' in the pattern stands in for the accented i in 'Título'
        table = soup.find(hastext('font',
                                  'T.tulo del Dignatario')).findNext('table')
        titledict = {}
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            if len(cells) != 2:
                print('unexpected table layout: ' + row.prettify())
                continue
            title = tagtext(cells[0])
            name = tagtext(cells[1])
            if name and title:
                titledict[title] = name
        return titledict
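To make the table layout dictOfTitles expects concrete, here is a hypothetical markup fragment (the real registry HTML is not shown in these examples) and the result a scraper instance would produce from it; everything in the snippet is invented for illustration:

from bs4 import BeautifulSoup

# The accent in 'Título' is dropped only to keep the snippet ASCII;
# the 'T.tulo' pattern matches either spelling.
sample = BeautifulSoup(
    '<font>Titulo del Dignatario</font>'
    '<table>'
    '<tr><td>PRESIDENTE</td><td>JOHN DOE</td></tr>'
    '<tr><td>TESORERO</td><td>JANE DOE</td></tr>'
    '</table>', 'html.parser')

# scraper.dictOfTitles(sample) would then return something like
# {u'PRESIDENTE': u'JOHN DOE', u'TESORERO': u'JANE DOE'}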
Example #5
    def scrapeData(self, rawtext):
        """
        Does the gritty data extraction from HTML
        returns a dictionary of what it found
        """
        scrapeddata = {}
        pagetext = '<html>' + rawtext[rawtext.find('output.html - end'):]
        soup = BeautifulSoup(pagetext)
        marker = soup.find(hastext('b', r'(?m)No.\W+de\W+Ficha')) or nosoup
        scrapeddata['nodeficha'] = tagtext(marker.findNext('p'))
        scrapeddata['nodocumento'] = tagtext(
            soup.find(hastext('b', r'(?m)No.\W+Documento')).findNext('p'))
        scrapeddata['nombredelasociedad'] = tagtext(
            soup.find(hastext(
                'b', '(?m)Nombre de la Sociedad')).findNext('table')).strip()
        #scrapeddata['Tomo'] = smallHead(soup, 'Tomo:')
        #scrapeddata['Folio'] = smallHead(soup, 'Folio:')
        #scrapeddata['Asiento'] = smallHead(soup, 'Asiento:')
        scrapeddata['registerdate'] = self.smallHead(soup,
                                                     'Fecha de Registro:')
        scrapeddata['agent'] = self.smallHead(soup, 'Agente Residente')
        headings = (
            'Fecha de Registro',
            'Status',
            'No. de Escritura',
            'Notaria',  #may be odd
            'Provincia Notaria',
            'Duración',
            'Domicilio',
            #'Status de la Prenda',
        )
        for heading in headings:
            scrapeddata[heading] = self.smallHead(soup, heading)
        scrapeddata['status_de_la_prenda'] = self.wideHead(soup, 'Prenda')
        scrapeddata['titles'] = self.dictOfTitles(soup)
        scrapeddata['directors'] = self.listOfDirectors(soup)
        scrapeddata['suscriptores'] = self.listFromTable(
            soup, 'Nombre de los Suscriptores')

        # we don't really care about the boilerplate explanations of share
        # distribution
        # scrapeddata['capital'] = self.listFromTable(soup, 'Capital')
        # scrapeddata['representantelegal'] = tagtext(soup.find(hastext('font', '(?m)Representante Legal')).findNext('table')).strip()
        # flag any scraped value that is not a unicode string (Python 2)
        for item in scrapeddata:
            if not isinstance(scrapeddata[item], unicode):
                print(scrapeddata[item])
        return scrapeddata
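smallHead is called throughout these examples but never defined on this page. Judging from wideHead in Example #9 below, it presumably follows the same find-the-heading-then-read-the-next-cell pattern; the body below is purely a guess (including which cell it reads), relying on the helper sketches given after Example #1:

    def smallHead(self, soup, term):
        """Hypothetical sketch: find the <font> heading for term and read
        the value from the table cell that follows it (wideHead, Example #9,
        does the same but insists on a 49%-wide cell)."""
        marker = soup.find(hastext('font', term)) or nosoup
        value = tagtext(marker.findNext('td')).strip()
        return value or BADDATA  # BADDATA: 'missing value' sentinel, see Example #9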
Example #6
    def listFromTable(self, soup, term):
        table = soup.find(hastext('font', term)).findNext('table')
        directors = []
        for row in table.findAll('tr'):
            text = tagtext(row).strip()
            if text:
                directors.append(text)
        return directors
Example #9
    def wideHead(self, soup, term):
        marker = soup.find(hastext('font', term)) or nosoup
        value = tagtext(marker.findNext('td', width='49%')).strip()
        return value or BADDATA
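BADDATA is another module-level name that never appears with a definition in these examples; "return value or BADDATA" only tells us it stands in for an empty cell. A hypothetical stand-in:

BADDATA = u'**BADDATA**'  # hypothetical sentinel; the real value is not shown here

Returning a recognisable sentinel rather than an empty string makes it easy to spot records where the scraper failed to read a field.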