def scrape_civil_escrito(self, causa_id, causa_obj, soup):
    """Searches for escritos pendientes in a civil causa."""
    # Double .parent because this table keeps its header <tr> inside a <thead>.
    rows = soup.find(id='titTablaCivEsc').parent.parent.find_all('tr')
    header = True
    for row in rows:
        if not header:  # skip the header row
            tds = row.find_all('td')
            input_el = row.find('input', attrs={'name': 'id_documento'})
            id_documento = input_el.attrs['value'] if input_el else None
            if id_documento:
                # id = causa_id__id_documento
                doc_id = '{}__{}'.format(causa_id, id_documento)
                created = False
                try:
                    doc_obj = EscritoCivilPorResolver.objects.get(id=doc_id)
                except EscritoCivilPorResolver.DoesNotExist:
                    doc_obj = EscritoCivilPorResolver(
                        id=doc_id,
                        causa=causa_obj,
                        fecha=simplify_string(tds[2].contents[0]),
                        tipo=simplify_string(tds[3].contents[0]),
                        solicitante=simplify_string(tds[4].contents[0]),
                    )
                    doc_obj.save()
                    created = True
                if created and self.profile.initial_migration_done:
                    print('Sending notification: {}'.format(doc_obj))
                    send_new_doc_notification(doc_obj)
        else:
            header = False
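# A note on the cell lookups above: each scraper indexes tds[...] and reads
# .contents[0] directly, which assumes the portal always renders every cell.
# The sketch below is a hypothetical guard (not used anywhere in this file)
# showing how those reads could be made defensive; it only relies on
# simplify_string, which this module already uses.
def _cell_text(tds, index, default=''):
    """Hypothetical helper: safely read and normalize a table cell's text."""
    try:
        return simplify_string(tds[index].contents[0])
    except (IndexError, AttributeError):
        # Missing cell or empty .contents: fall back to a default value.
        return default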
def scrape_familia_document(self, causa):
    """Opens the detail of a causa. `causa` is the soup form element."""
    session = self.session
    url = Scraper.CAUSA_TYPES['familia']['detail']

    # Collect the hidden form inputs needed to request the causa detail.
    data = {}
    for input_elm in causa.find_all('input'):
        if 'name' in input_elm.attrs and 'value' in input_elm.attrs:
            data[input_elm.attrs['name']] = input_elm.attrs['value']

    causa_id = 'FAM_{}_{}_{}'.format(data['tipo_causa'], data['rol_causa'],
                                     data['era_causa'])
    tr = causa.parent.parent
    tds = tr.find_all('td')
    caratulado = simplify_string(tds[3].contents[0])

    try:
        causa_obj = Causa.objects.get(id=causa_id)
    except Causa.DoesNotExist:
        causa_obj = Causa(id=causa_id,
                          user=self.profile,
                          type=Causa.TYPE_CHOICES_FAMILIA,
                          archived=False,
                          rol='{}-{}-{}'.format(data['tipo_causa'],
                                                data['rol_causa'],
                                                data['era_causa']),
                          caratulado=caratulado)
        causa_obj.save()
        if self.profile.initial_migration_done:
            send_new_causa_notification(causa_obj)

    if causa_obj and causa_id:
        # Open causa details:
        resp = session.post(url, data=data, headers=Scraper.SCRAPER_HEADERS)
        if 'Causa Familia' in resp.text:
            resp_text = resp.content.decode('ISO-8859-1').replace(
                '\r', ' ').replace('\n', '')
            html = '<html><body>{}</body></html>'.format(resp_text)
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find(id='titTablaFam').parent.find_all('tr')
            header = True
            for row in rows:
                if not header:  # skip the header row
                    link = row.find('a')
                    if link and 'onclick' in link.attrs:
                        # E.g. "vvbbFF('Resolución',3,59987979,0);"
                        onclick = link.attrs['onclick']
                        doc_id = onclick.replace('vvbbFF(', '').replace(
                            ');', '').replace(',', '-').replace("'", '')
                        tds = row.find_all('td')
                        if doc_id:
                            created = False
                            try:
                                doc_obj = DocFamilia.objects.get(id=doc_id)
                            except DocFamilia.DoesNotExist:
                                doc_obj = DocFamilia(
                                    id=doc_id,
                                    causa=causa_obj,
                                    etapa=simplify_string(tds[2].contents[0]),
                                    tramite=simplify_string(tds[3].contents[0]),
                                    desc_tramite=simplify_string(tds[4].contents[0]),
                                    referencia=simplify_string(tds[5].contents[0]),
                                    fecha=simplify_string(tds[6].contents[0]),
                                )
                                doc_obj.save()
                                created = True
                            if created and self.profile.initial_migration_done:
                                print('Sending notification: {}'.format(doc_obj))
                                send_new_doc_notification(doc_obj)
                else:
                    header = False
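# The chained .replace() calls in scrape_familia_document turn an onclick such
# as "vvbbFF('Resolución',3,59987979,0);" into "Resolución-3-59987979-0".
# Below is an equivalent, purely illustrative regex-based sketch (the name
# _parse_vvbbff is hypothetical and nothing in this file calls it); it assumes
# the onclick always has the vvbbFF(...) shape shown above.
def _parse_vvbbff(onclick):
    """Hypothetical helper: extract the vvbbFF(...) arguments as a doc id."""
    import re  # local import so the sketch stays self-contained

    match = re.match(r"vvbbFF\((.*)\);?\s*$", onclick or '')
    if not match:
        return None
    # Split the argument list, drop quotes, and join with dashes,
    # e.g. "Resolución-3-59987979-0".
    parts = [part.strip().strip("'") for part in match.group(1).split(',')]
    return '-'.join(parts)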
def scrape_cobranza_document(self, causa):
    """Opens the detail of a causa. `causa` is the soup form element."""
    session = self.session
    url = Scraper.CAUSA_TYPES['cobranza']['detail']

    # Collect the hidden form inputs needed to request the causa detail.
    data = {}
    for input_elm in causa.find_all('input'):
        if 'name' in input_elm.attrs and 'value' in input_elm.attrs:
            data[input_elm.attrs['name']] = input_elm.attrs['value']

    causa_id = 'COB_{}_{}_{}'.format(data['tipo_causa'], data['rol_causa'],
                                     data['era_causa'])
    tr = causa.parent.parent
    tds = tr.find_all('td')
    caratulado = simplify_string(tds[3].contents[0])

    try:
        causa_obj = Causa.objects.get(id=causa_id)
    except Causa.DoesNotExist:
        causa_obj = Causa(id=causa_id,
                          user=self.profile,
                          type=Causa.TYPE_CHOICES_COBRANZA,
                          archived=False,
                          rol='{}-{}-{}'.format(data['tipo_causa'],
                                                data['rol_causa'],
                                                data['era_causa']),
                          caratulado=caratulado)
        causa_obj.save()
        if self.profile.initial_migration_done:
            send_new_causa_notification(causa_obj)

    if causa_obj and causa_id:
        # Open causa details:
        resp = session.post(url, data=data, headers=Scraper.SCRAPER_HEADERS)
        if 'Causa Cobranza' in resp.text:
            resp_text = resp.content.decode('ISO-8859-1').replace(
                '\r', ' ').replace('\n', '')
            html = '<html><body>{}</body></html>'.format(resp_text)
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find(id='titTablaCob').parent.find_all('tr')
            header = True
            for row in rows:
                if not header:  # skip the header row
                    doc_data = {}
                    for doc_input in row.find_all('input'):
                        try:
                            doc_data[doc_input.attrs['name']] = \
                                doc_input.attrs['value']
                        except KeyError:
                            pass
                    doc_id = None
                    if 'cod_tribunal' in doc_data and 'crr_iddocumento' in doc_data:
                        doc_id = 'COB-{}-{}'.format(doc_data['cod_tribunal'],
                                                    doc_data['crr_iddocumento'])
                    tds = row.find_all('td')
                    if doc_id:
                        created = False
                        try:
                            doc_obj = DocCobranza.objects.get(id=doc_id)
                        except DocCobranza.DoesNotExist:
                            doc_obj = DocCobranza(
                                id=doc_id,
                                causa=causa_obj,
                                etapa=simplify_string(tds[1].contents[0]),
                                tramite=simplify_string(tds[2].contents[0]),
                                desc_tramite=simplify_string(tds[3].contents[0]),
                                fecha=simplify_string(tds[4].contents[0]),
                            )
                            doc_obj.save()
                            created = True
                        if created and self.profile.initial_migration_done:
                            print('Sending notification: {}'.format(doc_obj))
                            send_new_doc_notification(doc_obj)
                else:
                    header = False
def scrape_civil_document(self, causa):
    """Opens the detail of a causa. `causa` is the soup form element."""
    session = self.session
    url = Scraper.CAUSA_TYPES['civil']['detail']

    # Collect the hidden form inputs needed to request the causa detail.
    data = {}
    for input_elm in causa.find_all('input'):
        if 'name' in input_elm.attrs and 'value' in input_elm.attrs:
            data[input_elm.attrs['name']] = input_elm.attrs['value']

    causa_id = 'CIV_{}_{}_{}'.format(data['tipo'], data['rol'], data['ano'])
    tr = causa.parent.parent
    tds = tr.find_all('td')
    caratulado = simplify_string(tds[3].contents[0])

    try:
        causa_obj = Causa.objects.get(id=causa_id)
    except Causa.DoesNotExist:
        causa_obj = Causa(id=causa_id,
                          user=self.profile,
                          type=Causa.TYPE_CHOICES_CIVIL,
                          archived=False,
                          rol='{}-{}-{}'.format(data['tipo'], data['rol'],
                                                data['ano']),
                          caratulado=caratulado)
        causa_obj.save()
        if self.profile.initial_migration_done:
            send_new_causa_notification(causa_obj)

    if causa_obj and causa_id:
        # Open causa details:
        resp = session.post(url, data=data, headers=Scraper.SCRAPER_HEADERS)
        if 'Causa Civil' in resp.text:
            resp_text = resp.content.decode('ISO-8859-1').replace(
                '\r', ' ').replace('\n', '')
            html = '<html><body>{}</body></html>'.format(resp_text)
            soup = BeautifulSoup(html, 'html.parser')

            # Escritos pendientes live in a separate table on the same page.
            self.scrape_civil_escrito(causa_id, causa_obj, soup)

            # Double .parent because this table keeps its header <tr> inside a <thead>.
            rows = soup.find(id='titTablaCiv').parent.parent.find_all('tr')
            header = True
            for row in rows:
                if not header:  # skip the header row
                    tds = row.find_all('td')
                    if tds[0].contents[0] and tds[0].contents[0].strip() != '':
                        # id = causa_id__folio
                        doc_id = '{}__{}'.format(causa_id,
                                                 tds[0].contents[0].strip())
                        created = False
                        try:
                            doc_obj = DocCivil.objects.get(id=doc_id)
                        except DocCivil.DoesNotExist:
                            tribunal = ''
                            try:
                                strongs = soup.find_all('strong')
                                if len(strongs) >= 8:
                                    tribunal = simplify_string(
                                        strongs[7].parent.contents[1])
                            except Exception:
                                pass
                            doc_obj = DocCivil(
                                id=doc_id,
                                causa=causa_obj,
                                etapa=simplify_string(tds[3].contents[0]),
                                tramite=simplify_string(tds[4].contents[0]),
                                descripcion=simplify_string(tds[5].contents[0]),
                                fecha=simplify_string(tds[6].contents[0]),
                                foja=simplify_string(tds[7].contents[0]),
                                tribunal=tribunal)
                            doc_obj.save()
                            created = True
                        if created and self.profile.initial_migration_done:
                            print('Sending notification: {}'.format(doc_obj))
                            send_new_doc_notification(doc_obj)
                else:
                    header = False
def scrape_apelaciones_document(self, causa):
    """Opens the detail of a causa. `causa` is the soup form element."""
    session = self.session
    url = Scraper.CAUSA_TYPES['apelaciones']['detail']

    # Collect the hidden form inputs needed to request the causa detail.
    data = {}
    for input_elm in causa.find_all('input'):
        if 'name' in input_elm.attrs and 'value' in input_elm.attrs:
            data[input_elm.attrs['name']] = input_elm.attrs['value']

    causa_id = 'APE_{}_{}'.format(data['rol_causa'], data['era_causa'])
    tr = causa.parent.parent
    tds = tr.find_all('td')
    caratulado = simplify_string(tds[3].contents[0])

    try:
        causa_obj = Causa.objects.get(id=causa_id)
    except Causa.DoesNotExist:
        causa_obj = Causa(id=causa_id,
                          user=self.profile,
                          type=Causa.TYPE_CHOICES_APELACIONES,
                          archived=False,
                          rol='{}-{}'.format(data['rol_causa'],
                                             data['era_causa']),
                          caratulado=caratulado)
        causa_obj.save()
        if self.profile.initial_migration_done:
            send_new_causa_notification(causa_obj)

    if causa_obj and causa_id:
        # Open causa details:
        resp = session.post(url, data=data, headers=Scraper.SCRAPER_HEADERS)
        if 'Recurso Corte de Apelaciones' in resp.text:
            resp_text = resp.content.decode('ISO-8859-1').replace(
                '\r', ' ').replace('\n', '')
            html = '<html><body>{}</body></html>'.format(resp_text)
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find(id='titTablaApeGrid').parent.find_all('tr')
            header = True
            for row in rows:
                if not header:  # skip the header row
                    tds = row.find_all('td')
                    if tds[2].contents[0] and tds[2].contents[0].strip() != '':
                        doc_id = '{}__{}'.format(causa_id,
                                                 tds[2].contents[0].strip())
                        created = False
                        try:
                            doc_obj = DocApelaciones.objects.get(id=doc_id)
                        except DocApelaciones.DoesNotExist:
                            libro = ''
                            nro_ingreso = ''
                            try:
                                descPopUps = soup.find_all(
                                    'tr', attrs={'class': 'descPopUp'})
                                if len(descPopUps) >= 2:
                                    descPopUpData = descPopUps[1].find_all('td')
                                    libro = simplify_string(
                                        descPopUpData[0].contents[0])
                                    nro_ingreso = simplify_string(
                                        descPopUpData[1].contents[0])
                            except Exception:
                                pass
                            doc_obj = DocApelaciones(
                                id=doc_id,
                                causa=causa_obj,
                                tipo=simplify_string(tds[1].contents[0]),
                                descripcion=simplify_string(tds[3].contents[0]),
                                fecha=simplify_string(tds[4].contents[0]),
                                salas=simplify_string(tds[5].contents[0]),
                                foja_inicial=simplify_string(tds[6].contents[0]),
                                libro=libro,
                                nro_ingreso=nro_ingreso)
                            doc_obj.save()
                            created = True
                        if created and self.profile.initial_migration_done:
                            print('Sending notification: {}'.format(doc_obj))
                            send_new_doc_notification(doc_obj)
                else:
                    header = False
def scrape_suprema_document(self, causa):
    """Opens the detail of a causa."""
    session = self.session
    url = Scraper.CAUSA_TYPES['suprema']['detail']

    # Collect the hidden form inputs needed to request the causa detail.
    data = {}
    for input_elm in causa.find_all('input'):
        if 'name' in input_elm.attrs and 'value' in input_elm.attrs:
            data[input_elm.attrs['name']] = input_elm.attrs['value']

    causa_id = 'SUP_{}_{}'.format(data['rol_causa'], data['era_causa'])

    try:
        causa_obj = Causa.objects.get(id=causa_id)
    except Causa.DoesNotExist:
        causa_obj = Causa(id=causa_id,
                          user=self.profile,
                          type=Causa.TYPE_CHOICES_SUPREMA,
                          archived=False,
                          rol='{}-{}'.format(data['rol_causa'],
                                             data['era_causa']),
                          caratulado=data['caratulado'])
        causa_obj.save()
        if self.profile.initial_migration_done:
            send_new_causa_notification(causa_obj)

    if causa_obj and causa_id:
        # Open causa details:
        resp = session.post(url, data=data, headers=Scraper.SCRAPER_HEADERS)
        if 'Recurso Corte Suprema' in resp.text:
            resp_text = resp.content.decode('ISO-8859-1').replace(
                '\r', ' ').replace('\n', '')
            html = '<html><body>{}</body></html>'.format(resp_text)
            soup = BeautifulSoup(html, 'html.parser')
            rows = soup.find(id='titTablaSup').parent.find_all('tr')
            header = True
            for row in rows:
                if not header:  # skip the header row
                    tds = row.find_all('td')
                    iddoc_input = row.find('input', attrs={'name': 'iddoc'})
                    if iddoc_input:
                        doc_id = '{}__{}'.format(causa_id,
                                                 iddoc_input.attrs['value'])
                        created = False
                        try:
                            doc_obj = DocSuprema.objects.get(id=doc_id)
                        except DocSuprema.DoesNotExist:
                            doc_obj = DocSuprema(
                                id=doc_id,
                                causa=causa_obj,
                                anio=simplify_string(tds[2].contents[0]),
                                fecha=simplify_string(tds[3].contents[0]),
                                tipo=simplify_string(tds[4].contents[0]),
                                nomenclatura=simplify_string(tds[5].contents[0]),
                                descripcion=simplify_string(tds[6].contents[0]),
                                salas=simplify_string(tds[7].contents[0]))
                            doc_obj.save()
                            created = True
                        if created and self.profile.initial_migration_done:
                            print('Sending notification: {}'.format(doc_obj))
                            send_new_doc_notification(doc_obj)
                else:
                    header = False
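# Every scraper above repeats the same "look the document up by id, create it
# if missing, then notify once the initial migration is done" flow. A minimal
# sketch of how that could be factored out is shown below; _save_and_notify is
# a hypothetical name, is not called anywhere in this file, and assumes every
# Doc* model uses a string primary key named id.
def _save_and_notify(self, model_cls, doc_id, **fields):
    """Hypothetical helper: get-or-create a document and notify if it is new."""
    try:
        return model_cls.objects.get(id=doc_id)
    except model_cls.DoesNotExist:
        doc_obj = model_cls(id=doc_id, **fields)
        doc_obj.save()
        if self.profile.initial_migration_done:
            print('Sending notification: {}'.format(doc_obj))
            send_new_doc_notification(doc_obj)
        return doc_obj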