Python GiantSpider Exemples, octoCrawler.classes.GiantSpider.GiantSpider Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : trt.py Projet : cezardasilva/octo-crawler

class TrtSpider(CrawlSpider):
    """Crawl the TST "consulta processual" result list and store every lawsuit page.

    The spider starts from a fixed search URL, follows the "Proxima"
    pagination link and sends every link found in the ``processo`` table to
    :meth:`parse_page`, which saves one ``ProcessoItem`` per page through
    ``GiantSpider.saveItem``.

    NOTE(review): ``razao_social`` and ``arquivados`` are stored on the
    instance but nothing visible here uses them; ``start_urls`` is hard-coded
    for "TELEFONICA BRASIL S.A." -- confirm whether the URL should be built
    from ``razao_social``.
    """

    name = "trt"
    start_urls = ["http://aplicacao4.tst.jus.br/consultaProcessual/empregadorForm.do?nomeParte=TELEFONICA+BRASIL+S.A.+&stCheckBox=on&consulta=Consultar"]

    rules = (
        # Pagination: follow the "Proxima" link of the result list. There is
        # no callback, so these pages are only followed, not parsed.
        Rule(LinkExtractor(restrict_xpaths=('/html/body/table/tr/td/form/span[2]/a[text()="Proxima"]', ), deny=())),

        # Detail pages: every link inside the "processo" table is handed to parse_page.
        Rule(LinkExtractor(restrict_xpaths=('//*[@id="processo"]/tbody/tr/td/table/tr/td/a', )), callback='parse_page'),
    )

    def __init__(self, razao_social="TELEFONICA BRASIL S.A.", arquivados=False):
        """Create the spider and its ``GiantSpider`` helper.

        :param razao_social: company name; stored as ``self.razaoSocial``.
        :param arquivados: stored as ``self.arquivados``.
        """
        super(TrtSpider, self).__init__()
        self.gs = GiantSpider()
        self.razaoSocial = razao_social
        self.arquivados = arquivados

    def parse_page(self, response):
        """Extract the header fields and the history table of one lawsuit page.

        The result is stored with ``self.gs.saveItem(processo, "processo_trt")``.
        Rows without the expected text nodes are skipped instead of aborting
        the whole page with an ``IndexError``.

        :param response: the Scrapy response of the lawsuit detail page.
        """
        processo = ProcessoItem()
        processo["processo"] = []
        processo["acompanhamento"] = []

        # Lawsuit header: one {"descricao", "valor"} dict per labelled row.
        # NOTE(review): XPath positions are 1-based, so range(4, len(...))
        # never visits the last <tr>; kept as in the original -- confirm intent.
        header_rows = "/html/table/tr"
        for i in range(4, len(response.xpath(header_rows))):
            row = header_rows + "[" + str(i) + "]"
            descricao = response.xpath(row + "/td/b/text()").extract()
            if not descricao:
                # A row without a bold label carries no field.
                continue
            restoProcesso = {"descricao": descricao[0].strip(' \n\t')}
            valor = response.xpath(row + "/td[1]/text()").extract()
            if valor:
                valorFinal = valor[0].strip('\r\n\t')
                # Some values continue in a <font> inside the second cell.
                valor2 = response.xpath(row + "/td[2]/font/text()").extract()
                if valor2:
                    valorFinal += valor2[0].strip('\r\n\t')
                restoProcesso["valor"] = valorFinal
            processo["processo"].append(restoProcesso)

        # Lawsuit history ("acompanhamento processual"): date + description per row.
        history_rows = "/html/table/tr/td/table/tr"
        for i in range(3, len(response.xpath(history_rows))):
            row = history_rows + "[" + str(i) + "]"
            data = response.xpath(row + "/td[1]/font/text()").extract()
            descricao = response.xpath(row + "/td[2]/table/tr/td/font/text()").extract()
            if not descricao:
                # The description may be wrapped in a link instead of plain text.
                descricao = response.xpath(row + "/td[2]/table/tr/td/font/a/text()").extract()
            if not data or not descricao:
                # Spacer/irregular row: previously raised IndexError and the
                # item was never saved.
                continue
            processo["acompanhamento"].append({
                "data": data[0].strip('\r\n\t'),
                "descricao": descricao[0].strip('\r\n\t'),
            })

        self.gs.saveItem(processo, "processo_trt")

Exemple #2

0

Afficher le fichier

Fichier : certidao_negativa_debito.py Projet : cezardasilva/octo-crawler

class CndSpider(scrapy.Spider):
    """Query the Dataprev CND page for one CNPJ and store the listed certificates.

    The form is driven through a Firefox WebDriver session that is opened in
    ``__init__``; the resulting HTML is parsed with a Scrapy ``Selector`` and
    saved through ``GiantSpider.saveItem`` under the spider name.
    """

    name = "cnd"
    url = "http://cnd.dataprev.gov.br/cws/contexto/cnd/cnd.html"
    start_urls = ["http://cnd.dataprev.gov.br/cws/contexto/cnd/cnd.html"]

    def __init__(self, cnpj="02558157000162"):
        """Open the browser on the CND page.

        :param cnpj: CNPJ typed into the form and stored on the saved item.
        """
        self.gs = GiantSpider()
        self.driver = webdriver.Firefox()
        self.driver.get(self.url)
        self.cnpj = cnpj

    def parse(self, response):
        """Fill in and submit the form, then scrape the result page.

        The browser is always closed, even when an element cannot be found.

        :param response: Scrapy response for ``start_urls``; not used, the
            page is read from the WebDriver session instead.
        :return: whatever :meth:`scraping` returns (currently ``None``).
        """
        try:
            # The form lives inside the frame named "CORPO".
            frame = self.driver.find_element_by_name("CORPO")
            self.driver.switch_to.frame(frame)

            cnpjInput = self.driver.find_element_by_name("num")
            cnpjInput.send_keys(self.cnpj)
            cnpjSelect = self.driver.find_element_by_xpath("/html/body/center/form/font/p[1]/table/tbody/tr[1]/td/font/input[1]")
            cnpjSelect.click()
            continuarInput = self.driver.find_element_by_xpath("/html/body/center/form/font/p[1]/table/tbody/tr[2]/td[2]/input[1]")
            continuarInput.click()

            self.html = self.driver.page_source
        finally:
            # Previously skipped when any lookup above raised, leaking the browser.
            self.driver.quit()
        return self.scraping()

    def scraping(self):
        """Parse ``self.html`` and save one ``CndItem`` with all certificate rows.

        Each table row from the second one on becomes its own dict in
        ``cnd["certidoes"]``. A row missing one of the expected cells still
        raises ``IndexError``, as before.
        """
        cnd = CndItem()
        cnd["certidoes"] = []
        cnd["cnpj"] = self.cnpj

        # Parse the page once instead of once per cell.
        page = Selector(text=self.html)
        rows_xpath = '/html/body/div[2]/table[1]/tbody/tr'
        total = len(page.xpath(rows_xpath).extract())

        # Columns 2..6 of every row, in the order they are stored.
        colunas = ("data_emissao", "fin", "data_validade", "data_cancelamento", "hora_brasilia")

        # XPath positions are 1-based; row 1 is skipped (the loop starts at 2).
        for i in range(2, total + 1):
            row = rows_xpath + '[' + str(i) + ']'
            # A fresh dict per row: the original reused one dict, so every
            # list entry ended up pointing at the values of the last row.
            certidao = {}
            # The number is either a link or plain text.
            numero = page.xpath(row + '/td[1]/font/a/text()').extract()
            if not numero:
                numero = page.xpath(row + '/td[1]/font/text()').extract()
            certidao["numero"] = numero[0]
            for posicao, campo in enumerate(colunas, 2):
                certidao[campo] = page.xpath(row + '/td[' + str(posicao) + ']/font/text()').extract()[0]
            cnd["certidoes"].append(certidao)

        self.gs.saveItem(cnd, self.name)

Exemple #3

0

Afficher le fichier

Fichier : fidics.py Projet : cezardasilva/octo-crawler

 def __init__(self):
     """Start a Firefox WebDriver session and create the GiantSpider helper."""
     # Launches a real browser; the code using this object is responsible for quitting it.
     self.driver = webdriver.Firefox()
     # Project helper object (defined in octoCrawler.classes.GiantSpider per the page title).
     self.gs = GiantSpider()

Exemple #4

0

Afficher le fichier

Fichier : fidics.py Projet : cezardasilva/octo-crawler

class FidicsSpider(scrapy.Spider):
    name = "Fidcs"
    allowed_domains = ["cvmweb.cvm.gov.br"]
    start_urls = [
        "http://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/CPublica/CConsolFdo/FormBuscaParticFdo.aspx?TP_FIDC=IN489&PK_PARTIC="
    ]

    fidcs_urls = [
        106297,
        111771,
        118537,
        96308,
        95803,
        139408,
        85553,
        109689,
        89969,
        103021,
        104764,
        98190,
        84006,
        120039,
        134269,
        98953,
        128391,
        132491,
        93608,
        138297,
        94716,
        115390,
        117665,
        98500,
        96762,
        87968,
        107825,
        131264,
        89162,
        119507,
        132220,
        121671,
        110665,
        134156,
        134146,
        96788,
        97160,
        43464,
        111159,
        63632,
        64284,
        116793,
        140483,
        108021,
        130309,
        102748,
        118662,
        117692,
        107155,
        130356,
        59871,
        138454,
        106736,
        127391,
        131978,
        97883,
        132271,
        133538,
        96800,
        104765,
        75957,
        96418,
        68016,
        115627,
        138111,
        100209,
        123305,
        132747,
        111781,
        74763,
        109968,
        133599,
        69045,
        96467,
        135917,
        108432,
        75974,
        106444,
        115933,
        98730,
        95639,
        61273,
        112845,
        95487,
        92133,
        89288,
        72110,
        118868,
        118529,
        57886,
        77890,
        100360,
        118669,
        88841,
        85729,
        95735,
        46877,
        84975,
        133895,
        67292,
        75163,
        60945,
        101793,
        73687,
        71204,
        125040,
        107408,
        89651,
        70974,
        73529,
        44682,
        141172,
        103906,
        132696,
        127719,
        111163,
        115859,
        43438,
        135009,
        100796,
        132466,
        137689,
        135924,
        110696,
        133848,
        130840,
        107455,
        71286,
        128221,
        141107,
        134925,
        116364,
        114793,
        134786,
        126550,
        141219,
        105679,
        107436,
        120200,
        139271,
        123652,
        138914,
        117802,
        130985,
        108290,
        134497,
        119531,
        125017,
        129742,
        120840,
        133698,
        117735,
        127679,
        101264,
        82075,
        135789,
        127580,
        115504,
        137352,
        131210,
        112864,
        124176,
        112064,
        79282,
        81777,
        82207,
        60923,
        60942,
        120051,
        98195,
        99336,
        107138,
        74931,
        74788,
        79044,
        104904,
        82717,
        104417,
        126315,
        126787,
        138007,
        107509,
        89282,
        120168,
        123650,
        135681,
        136414,
        119178,
        135067,
        128810,
        138594,
        131861,
        133341,
        97366,
        126454,
        116594,
        132156,
        127578,
        116215,
        83390,
        69221,
        133055,
        111292,
        103655,
        131274,
        127236,
        118891,
        109273,
        140818,
        96758,
        134695,
        60962,
        43088,
        58756,
        91018,
        93467,
        117410,
        109842,
        137255,
        110392,
        96768,
        136006,
        130306,
        102186,
        106257,
        58905,
        104871,
        106128,
        118966,
        134824,
        111779,
        57051,
        105331,
        107566,
        109733,
        131411,
        119672,
        94086,
        102183,
        106284,
        92198,
        76542,
        130814,
        42209,
        61254,
        95722,
        112689,
        82410,
        107290,
        70133,
        66077,
        91549,
        137184,
        133981,
        92126,
        138314,
        118842,
        111948,
        127392,
        124068,
        104050,
        94720,
        133863,
        95057,
        84085,
        131118,
        136654,
        110852,
        70810,
        130314,
        95509,
        139718,
        106336,
        89032,
        91639,
        98598,
        137597,
        96211,
        134870,
        106282,
        129168,
        94827,
        60943,
        135244,
        131447,
        77925,
        132164,
        108243,
        81031,
        139111,
        131120,
        105788,
        88194,
        130984,
        94599,
        135379,
        123353,
        102513,
        132851,
        135264,
        87965,
        77984,
        130810,
        91526,
        124576,
        99644,
        120060,
        130330,
        103022,
        115907,
        116617,
        128778,
        115860,
        138008,
        118812,
        102754,
        131762,
        141589,
        105723,
        88626,
        111441,
        129987,
        119909,
        134224,
        111294,
        77247,
        137531,
        94108,
        119921,
        137686,
        124986,
        126700,
        126553,
        134548,
        90442,
        90178,
        129686,
        138153,
        129581,
        101335,
        103749,
        141679,
        125957,
        95717,
        141580,
        133333,
        135226,
        138639,
        87814,
        100379,
        98724,
        119500,
        90509,
        84825,
        76540,
        80872,
        81798,
        84764,
        80339,
        77775,
        76511,
        76338,
        90500,
        92733,
        109361,
        116122,
        99471,
        99472,
        90639,
        138089,
        136026,
        137540,
        120180,
        120175,
        138868,
        126334,
        138867,
        126084,
        138286,
        133609,
        138863,
        126537,
        133050,
        122591,
        135544,
        139147,
        138864,
        77926,
        126622,
        125195,
        125903,
        123785,
        126538,
        121964,
        141720,
        90144,
        127389,
        140493,
        130117,
        92197,
        129780,
        110480,
        138862,
        124621,
        117732,
        96861,
        129476,
        124064,
        102089,
        115975,
        129928,
        105791,
        137998,
        130564,
        123637,
        126732,
        137539,
        86733,
        76210,
        76331,
        110382,
        138479,
        121440,
        132718,
        126158,
        92341,
        130811,
        139173,
        137535,
        136112,
        77480,
        103713,
        126749,
        126688,
        132027,
        140115,
        134203,
        123275,
        125659,
        83443,
        116723,
        134744,
        133045,
        112088,
        127008,
        138865,
        126445,
        119340,
        123853,
        120305,
        91019,
        134871,
        98140,
        127707,
        115031,
        140677,
        99796,
        134976,
        138504,
        73286,
        73287,
        119706,
        136590,
        116467,
        89691,
        132270,
        76341,
        123859,
        117764,
        135975,
        132267,
    ]

    def __init__(self):
        """Start a Firefox WebDriver session and create the GiantSpider helper."""
        # Browser session used by execute() and scrapingInformeMensal(); parse() quits it.
        self.driver = webdriver.Firefox()
        # Helper used in this class for decodeCaptchaBypass() and saveItem().
        self.gs = GiantSpider()

    def parse(self, response):
        """Process every fund id in ``fidcs_urls`` and then close the browser.

        Processing is best-effort: a failure on one fund is logged and the
        loop continues with the next one. The WebDriver session is always
        closed, even if the loop is interrupted.

        :param response: Scrapy response for the base search URL; its ``url``
            is the prefix each fund id is appended to in :meth:`execute`.
        """
        # Local import keeps this method self-contained.
        import logging

        log = logging.getLogger(__name__)
        try:
            for fidc in self.fidcs_urls:
                try:
                    self.execute(response, fidc)
                except Exception:
                    # Was a bare ``except: pass``: that also swallowed
                    # KeyboardInterrupt/SystemExit and left no trace of
                    # which fund failed.
                    log.exception("Failed to process FIDC %s", fidc)
        finally:
            self.driver.quit()

    def execute(self, response, fidc):
        """Open one fund page, solve its captcha and scrape header + monthly reports.

        If the site shows the ``lblMsg`` element after submitting the
        captcha, the whole step is retried for the same fund.

        :param response: Scrapy response whose ``url`` is the page prefix.
        :param fidc: fund id (an entry of ``fidcs_urls``) appended to the URL.
        """
        # str() replaces the Python-2-only backtick repr; identical for these ints.
        self.driver.get(response.url + str(fidc))
        # NOTE(review): the first argument is the literal "106297" for every
        # fund, while ReceitaFederalSpider passes its own identifier there --
        # confirm whether str(fidc) was intended.
        captcha = self.gs.decodeCaptchaBypass("106297", self.driver, "fidc", (996, 266, 1135, 305))

        if not captcha:
            print("Captcha invalido!")
            return

        captchaInput = self.driver.find_element_by_xpath("//*[@id='numRandom']")
        captchaInput.send_keys(captcha)
        continuarInput = self.driver.find_element_by_xpath("//*[@id='btnContinuar']")
        continuarInput.click()

        if self.driver.find_elements_by_xpath("//*[@id='lblMsg']"):
            # Retry, then stop: the original fell through after the retry and
            # scraped/saved a second time from whatever page the retry left open.
            self.execute(response, fidc)
            return

        self.html = self.driver.page_source
        _id = self.scrapingHeader()
        self.scrapingInformeMensal(_id)

    def scrapingHeader(self):
        """Build a ``FidcItem`` from the ``tabAtivos`` table in ``self.html`` and save it.

        Every ``<span>`` of the table contributes its list of text nodes; a
        span without text contributes ``[" "]`` so positions stay aligned.
        Fields are then read by fixed position.

        :return: the value returned by ``self.gs.saveItem`` or ``None`` when
            the table yields no spans.
        """
        fidc = FidcItem()
        fidc["administradora"] = {}
        fidc["diretor"] = {}
        fidc["gestora"] = {}
        fidc["gestora"]["diretor"] = {}

        spans = Selector(text=self.html).xpath('//*[@id="tabAtivos"]/tbody/tr/td/span')
        data = [span.xpath("text()").extract() or [" "] for span in spans]
        if not data:
            return None

        def first(posicao):
            # First text node of the span at the given position.
            return data[posicao][0]

        fidc["denominacao"] = first(1)
        fidc["cnpj"] = first(2)
        fidc["administradora"]["denominacao"] = first(3)
        fidc["administradora"]["cnpj"] = first(4)
        fidc["diretor"]["nome"] = first(5)
        # cpf and email fields keep the whole list of text nodes, not just the first.
        fidc["diretor"]["cpf"] = data[6]
        fidc["telefone"] = first(7)
        fidc["fax"] = first(8)
        fidc["email"] = data[9]
        fidc["endereco"] = first(10)
        fidc["gestora"]["denominacao"] = first(11)
        fidc["gestora"]["cnpj"] = first(12)
        fidc["gestora"]["diretor"]["responsavel"] = first(13)
        fidc["gestora"]["diretor"]["cpf"] = data[14]
        fidc["gestora"]["telefone"] = first(15)
        fidc["gestora"]["fax"] = first(16)
        fidc["gestora"]["email"] = data[17]
        # The manager address is only read when a 19th span exists.
        if len(data) > 18:
            fidc["gestora"]["endereco"] = first(18)

        return self.gs.saveItem(fidc, self.name)

    def scrapingInformeMensal(self, _id):
        """Scrape the monthly report of every listed period and save each one.

        Clicks ``Hyperlink4``, then selects each ``<option>`` of the
        ``ddlComptc`` drop-down in turn. For every period the highlighted
        report rows and the ``Table6`` assignor rows are collected and saved
        with ``self.gs.saveItem(..., "fidc_informe_mensal")``.

        :param _id: value stored as ``_fidic_id`` on every saved report
            (the result of :meth:`scrapingHeader`).
        """
        self.driver.find_element_by_xpath("//*[@id='Hyperlink4']").click()
        self.html = self.driver.page_source
        informe_mensal = FidcInformeMensalItem()
        informe_mensal["informe"] = {}
        informe_mensal["cedentes"] = {}

        competencia = Selector(text=self.html).xpath("//*[@id='ddlComptc']/option").extract()
        # XPath positions are 1-based; an empty list simply skips the loop.
        for i in range(1, len(competencia) + 1):
            option_xpath = "//*[@id='ddlComptc']/option[" + str(i) + "]"
            self.driver.find_element_by_xpath(option_xpath).click()
            self.html = self.driver.page_source
            # Parse the refreshed page once per period.
            page = Selector(text=self.html)

            informe = []
            linhas = page.xpath("//tr[contains(@style, 'background-color:#FAEFCA;')]")
            # The first highlighted row is skipped, as in the original loop.
            for linha in linhas[1:]:
                valor = linha.xpath("td[2]/span/text()").extract()
                if not valor:
                    continue
                # Title rows carry the label in <b>; plain rows directly in the span.
                descricao = (linha.xpath("td[1]/span/b/text()").extract()
                             or linha.xpath("td[1]/span/text()").extract())
                if not descricao:
                    continue
                informe.append({
                    "descricao": descricao[0].strip(),
                    "valor": valor[0].strip(),
                })

            cedente = {}
            # The first row of Table6 is skipped, as in the original loop.
            for linha in page.xpath("//*[@id='Table6']/tbody/tr")[1:]:
                cnpj_cell = linha.xpath("td[2]/span/text()").extract()
                nome_cell = linha.xpath("td[3]/span/text()").extract()
                # An empty <span> yields no text node; treat it as "" instead
                # of raising IndexError and losing the whole fund.
                cnpj = cnpj_cell[0].strip() if cnpj_cell else ""
                if cnpj and cnpj != "0":
                    cedente[cnpj] = nome_cell[0].strip() if nome_cell else ""

            informe_mensal["_fidic_id"] = _id
            informe_mensal["competencia"] = page.xpath(option_xpath + "/text()").extract()[0]
            informe_mensal["informe"] = informe
            informe_mensal["cedentes"] = cedente if cedente else "Nenhum cedente"
            self.gs.saveItem(informe_mensal, "fidc_informe_mensal")

Exemple #5

0

Afficher le fichier

Fichier : certidao_negativa_debito.py Projet : cezardasilva/octo-crawler

 def __init__(self, cnpj="02558157000162"):
     """Create the helper, open Firefox on ``self.url`` and remember the CNPJ.

     :param cnpj: CNPJ to query; stored as ``self.cnpj``.
     """
     self.gs = GiantSpider()
     browser = webdriver.Firefox()
     self.driver = browser
     # Load the page right away so it is ready when the spider starts parsing.
     browser.get(self.url)
     self.cnpj = cnpj

Exemple #6

0

Afficher le fichier

Fichier : trt.py Projet : cezardasilva/octo-crawler

 def __init__(self, razao_social="TELEFONICA BRASIL S.A.", arquivados=False):
     """Initialise the base spider, the GiantSpider helper and the search options.

     :param razao_social: company name; stored as ``self.razaoSocial``.
     :param arquivados: stored as ``self.arquivados``.
     """
     super(TrtSpider, self).__init__()
     self.gs = GiantSpider()
     self.razaoSocial, self.arquivados = razao_social, arquivados

Exemple #7

0

Afficher le fichier

Fichier : receita_federal.py Projet : cezardasilva/octo-crawler

class ReceitaFederalSpider(scrapy.Spider):
    """Fetch the Receita Federal CNPJ card and QSA page for one CNPJ.

    The captcha-protected form is driven through a Firefox WebDriver
    session; both result pages are parsed with a Scrapy ``Selector`` into a
    ``ReceitaItem`` that is saved through ``GiantSpider.saveItem``.
    """

    name = "ReceitaFederal"
    url = "http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp"
    start_urls = ["http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/valida.asp"]

    def __init__(self, cnpj="21101794000150"):
        """Create the helper and the browser session.

        :param cnpj: CNPJ to look up; stored as ``self.cnpj``.
        """
        self.gs = GiantSpider()
        self.driver = webdriver.Firefox()
        self.cnpj = cnpj

    def parse(self, response):
        """Run the lookup end to end and save the result.

        The browser is always closed once both pages have been captured,
        including when a step raises.

        :param response: Scrapy response for ``start_urls``; not used, the
            pages are read from the WebDriver session.
        """
        try:
            self.driver.get(self.url)
            self.fillForm()

            # First page: the CNPJ card.
            self.html = self.driver.page_source
            receita = self.scraping()

            # Second page: the partners board (QSA).
            self.driver.find_element_by_name("qsa").click()
            self.html = self.driver.page_source
        finally:
            # quit() ends the whole session, so a separate close() is not needed.
            self.driver.quit()

        receita = self.scrapingQSA(receita)
        self.gs.saveItem(receita, self.name)
        self.gs.updateFile(self.cnpj)

    def fillForm(self):
        """Fill in CNPJ and captcha and submit, repeating until no error is shown.

        A loop replaces the original self-recursion, so repeated captcha
        failures can no longer exhaust the call stack.
        """
        while True:
            captcha = self.gs.decodeCaptchaBypass(self.cnpj, self.driver, self.name, (182,150,363,199))

            cnpjInput = self.driver.find_element_by_xpath("//*[@id='cnpj']")
            cnpjInput.send_keys(self.cnpj)

            captchaInput = self.driver.find_element_by_xpath("//*[@id='txtTexto_captcha_serpro_gov_br']")
            captchaInput.send_keys(captcha)

            continuarInput = self.driver.find_element_by_xpath("//*[@id='submit1']")
            continuarInput.click()

            # The presence of this bold element is what the original treated
            # as "submission failed, try again".
            error = self.driver.find_elements_by_xpath("//*[@id='theForm']/font/font/table/tbody/tr[2]/td/font/b")
            if not error:
                return

    def _texto(self, page, xpath):
        """Return the first text node matched by ``xpath``, stripped of ' \\r\\n\\t'.

        Raises ``IndexError`` when nothing matches, like the inline
        ``extract()[0]`` calls it replaces.
        """
        return page.xpath(xpath).extract()[0].strip(' \r\n\t')

    def scraping(self):
        """Parse the CNPJ card in ``self.html`` into a new ``ReceitaItem``.

        :return: the item; only the empty ``endereco``/``contato``/
            ``cadastral`` dicts are set when the CNPJ cell is not found.
        """
        receita = ReceitaItem()
        receita['endereco'] = {}
        receita['contato'] = {}
        receita['cadastral'] = {}

        # Parse the page once instead of once per field.
        page = Selector(text=self.html)
        pre_xpath = "/html/body/table[2]/tbody/tr/td/"
        cnpj_xpath = pre_xpath + 'table[2]/tbody/tr/td[1]/font[2]/b[1]/text()'
        if not page.xpath(cnpj_xpath):
            return receita

        # Drop the punctuation of the formatted CNPJ.
        receita['cnpj'] = re.sub('[./-]', '', self._texto(page, cnpj_xpath))
        # NOTE(review): the leading '/' makes this '...td//table[2]...'
        # (descendant axis) unlike every other field; kept as is -- confirm.
        receita['data_constituicao'] = self._texto(page, pre_xpath + '/table[2]/tbody/tr/td[3]/font/b/text()')
        receita['razao_social'] = self._texto(page, pre_xpath + 'table[3]/tbody/tr/td/font[2]/b/text()')
        receita['nome_fantasia'] = self._texto(page, pre_xpath + 'table[4]/tbody/tr/td/font[2]/b/text()')

        # TODO: atividade_economica_primaria / atividade_economica_secundaria
        # are not populated; the original only had an unused query on
        # table[5] and commented-out placeholders.

        receita['natureza_juridica'] = self._texto(page, pre_xpath + 'table[7]/tbody/tr/td/font[2]/b/text()')

        # Address
        for campo, caminho in (
            ('logradouro', 'table[8]/tbody/tr/td[1]/font[2]/b/text()'),
            ('numero', 'table[8]/tbody/tr/td[3]/font[2]/b/text()'),
            ('complemento', 'table[8]/tbody/tr/td[5]/font[2]/b/text()'),
            ('bairro', 'table[9]/tbody/tr/td[3]/font[2]/b/text()'),
            ('cidade', 'table[9]/tbody/tr/td[5]/font[2]/b/text()'),
            ('uf', 'table[9]/tbody/tr/td[7]/font[2]/b/text()'),
            ('cep', 'table[9]/tbody/tr/td[1]/font[2]/b/text()'),
        ):
            receita['endereco'][campo] = self._texto(page, pre_xpath + caminho)

        # Contact
        # NOTE(review): 'email' and 'telefone' read the very same cell
        # (table[10] td[1]); looks like a copy-paste slip -- confirm the
        # right cell for 'telefone' against the live page.
        for campo, caminho in (
            ('email', 'table[10]/tbody/tr/td[1]/font[2]/b/text()'),
            ('telefone', 'table[10]/tbody/tr/td[1]/font[2]/b/text()'),
            ('ente_federativo_responsavel', 'table[11]/tbody/tr/td/font[2]/b/text()'),
        ):
            receita['contato'][campo] = self._texto(page, pre_xpath + caminho)

        # Registration status
        for campo, caminho in (
            ('situacao', 'table[12]/tbody/tr/td[1]/font[2]/b/text()'),
            ('data', 'table[12]/tbody/tr/td[3]/font[2]/b/text()'),
            ('motivo', 'table[13]/tbody/tr/td/font[2]/b/text()'),
            ('situacao_especial', 'table[14]/tbody/tr/td[1]/font[2]/b/text()'),
            ('data_especial', 'table[14]/tbody/tr/td[3]/font[2]/b/text()'),
        ):
            receita['cadastral'][campo] = self._texto(page, pre_xpath + caminho)

        # Keep the raw page as evidence of the CNPJ card.
        receita["html_cartao_cnpj"] = self.html
        return receita

    def scrapingQSA(self, receita):
        """Add the ``qsa`` section (share capital and partners) from ``self.html``.

        :param receita: item returned by :meth:`scraping`; modified in place.
        :return: the same item.
        """
        page = Selector(text=self.html)
        receita['qsa'] = {}

        capital_social = page.xpath('/html/body/table[2]/tbody/tr/td/table/tbody/tr[3]/td[2]/text()')
        if capital_social:
            receita['qsa']['capital_social'] = capital_social.extract()[0].strip(' \r\n\t')
        else:
            receita['qsa']['capital_social'] = "NAO PREENCHIDO"

        linhas_xpath = '/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr'
        quadros = page.xpath(linhas_xpath)
        if not quadros:
            receita['qsa']['quadro_social'] = "A NATUREZA JURIDICA NAO PERMITE O PREENCHIMENTO DO QSA"
            return receita

        receita['qsa']['quadro_social'] = []
        # %d placeholders: partner row, then inner row (1 = name, 2 = qualification).
        campo_xpath = linhas_xpath + '[%d]/td/fieldset/table/tbody/tr/td[1]/table/tbody/tr[%d]/td[2]/text()'
        # NOTE(review): XPath positions are 1-based, so this range never
        # visits the last <tr>; kept as in the original -- confirm intent.
        for k in range(1, len(quadros)):
            nome_empresarial = page.xpath(campo_xpath % (k, 1)).extract()
            qualificacao = page.xpath(campo_xpath % (k, 2)).extract()
            # A partner is stored only when both name and qualification exist.
            if nome_empresarial and qualificacao:
                receita['qsa']['quadro_social'].append({
                    "nome_empresarial": nome_empresarial[0].strip(' \r\n\t'),
                    "qualificacao": qualificacao[0].strip(' \r\n\t'),
                })

        return receita

Exemple #8

0

Afficher le fichier

Fichier : receita_federal.py Projet : cezardasilva/octo-crawler

 def __init__(self, cnpj="21101794000150"):
     """Create the GiantSpider helper and a Firefox session, and store the CNPJ.

     :param cnpj: CNPJ to look up; stored as ``self.cnpj``.
     """
     # Project helper object (defined in octoCrawler.classes.GiantSpider per the page title).
     self.gs = GiantSpider()
     # Launches a real browser; no page is loaded here.
     self.driver = webdriver.Firefox()                   
     self.cnpj = cnpj