Ejemplo n.º 1
0
    def update_data_for_month(self, mandate, year, month):
        """Archive every expense reported for ``mandate`` in ``year``/``month``.

        Each record yielded by ``self.find_data_for_month`` becomes one
        ``ArchivedExpense`` attached to ``self.collection_run``. The supplier
        is looked up by its normalized CPF/CNPJ and created when missing.
        """
        for entry in self.find_data_for_month(mandate, year, month):
            nature_name = '{0}: {1}'.format(entry['budget_title'], entry['budget_subtitle'])
            expense_nature = self.get_or_create_expense_nature(nature_name)

            supplier_name = entry.get('nome') or 'Sem nome'
            # Placeholder identifier used when normalization yields nothing;
            # it embeds the supplier name.
            placeholder = u'Sem CPF/CNPJ ({0})'.format(supplier_name)
            identifier = self.normalize_cnpj_or_cpf(entry.get('cpf_cnpj')) or placeholder

            try:
                supplier = Supplier.objects.get(identifier=identifier)
            except Supplier.MultipleObjectsReturned:
                # FIXME: several suppliers share this identifier; the second
                # row of the (unordered) result set is the one picked.
                supplier = Supplier.objects.filter(identifier=identifier)[1]
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=identifier, name=supplier_name)
                supplier.save()

            expense_date = datetime.strptime(entry['date'], '%d/%m/%Y')
            fields = dict(
                number=entry.get('number', ''),
                nature=expense_nature,
                date=expense_date,
                value=entry['value_presented'],
                expensed=entry['value_expensed'],
                mandate=mandate,
                supplier=supplier,
                collection_run=self.collection_run,
            )
            ArchivedExpense(**fields).save()
Ejemplo n.º 2
0
 def get_or_create_supplier(self, identifier, name=None):
     """Return the ``Supplier`` stored under the normalized ``identifier``.

     When no such supplier exists a new one is saved with ``name`` and the
     creation is reported through ``self.debug``.
     """
     normalized = self.normalize_cnpj_or_cpf(identifier)
     try:
         return Supplier.objects.get(identifier=normalized)
     except Supplier.DoesNotExist:
         pass

     created = Supplier(identifier=normalized, name=name)
     created.save()
     self.debug(u'New supplier found: {0}'.format(unicode(created)))
     return created
Ejemplo n.º 3
0
 def get_or_create_supplier(self, identifier, name=None):
     """Fetch the supplier matching ``identifier``, creating it if needed.

     ``identifier`` is normalized with ``self.normalize_cnpj_or_cpf`` before
     the lookup; ``name`` is only used when a new supplier has to be saved.
     """
     lookup = {'identifier': self.normalize_cnpj_or_cpf(identifier)}
     try:
         found = Supplier.objects.get(**lookup)
     except Supplier.DoesNotExist:
         found = Supplier(name=name, **lookup)
         found.save()
         self.debug(u'New supplier found: {0}'.format(unicode(found)))
     return found
Ejemplo n.º 4
0
    def update_data_for_legislator(self, data, code, month, year):
        self.debug("Updating data %s/%s for legislator: %s" % (month, year, code))

        data = data.find('div', {'class': 'row'})

        legislator = data.find('h2').findChildren()[0].next
        legislator = self._normalize_name(legislator)
        legislator, created = Legislator.objects.get_or_create(name=legislator)

        if created:
            self.debug("New legislator: %s" % unicode(legislator))
        else:
            self.debug("Found existing legislator: %s" % unicode(legislator))

        mandate = self.mandate_for_legislator(legislator, party=None, original_id=code)

        natures = data.findAll('h3')
        for data in natures:
            nature, _ = ExpenseNature.objects.get_or_create(name=self._normalize_nature(data.text))
            rows = data.findNext().findAll('tr')[1:-1]
            for row in rows:
                columns = row.findAll('td')

                if not len(columns) == 5:
                    print u'Bad row: %s' % unicode(columns)
                    continue

                cnpj = self.normalize_cnpj_or_cpf(columns[0].getText())

                supplier_name = columns[1].getText().strip()

                try:
                    supplier_name = supplier_name.decode('utf-8')
                except Exception:
                    pass

                try:
                    supplier = Supplier.objects.get(identifier=cnpj)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=cnpj, name=supplier_name)
                    supplier.save()

                docnumber = columns[2].getText()
                expensed = parse_money(columns[3].getText())

                expense = ArchivedExpense(number=docnumber,
                                          nature=nature,
                                          date=date(year, month, 1),
                                          expensed=expensed,
                                          mandate=mandate,
                                          supplier=supplier,
                                          collection_run=self.collection_run)
                expense.save()

                self.debug("New expense found: %s" % unicode(expense))
Ejemplo n.º 5
0
    def _update_data_for_year(self, mandates, year, month):
        """Archive the expenses published for each mandate in ``year``/``month``.

        One page is fetched per mandate (keyed by ``mandate.original_id``).
        Every ``h4`` heading inside the ``div-com-verba`` element names an
        expense nature, and the table in the heading's next sibling provides
        the date, CPF/CNPJ, supplier name and value of the expense.
        """
        # In-memory cache of ExpenseNature objects by name. It lives outside
        # the mandate loop so a nature resolved for one mandate is reused for
        # the following ones instead of hitting the database again.
        expense_natures = {}

        for mandate in mandates:
            url = '{0}/?dep={1}&ano={2}&mes={3}'.format(
                TRANSPARENCIA_URL,
                mandate.original_id,
                year,
                month,
            )
            expenses_data = self.retrieve_uri(url)
            natures = expenses_data.find(id='div-com-verba').findAll('h4')
            for nature in natures:
                # The heading is split on '-' and the second piece is used as
                # the nature name.
                nature_name = nature.text.split('-')[1].strip()
                expense_nature = expense_natures.get(nature_name)
                if expense_nature is None:
                    expense_nature, _ = ExpenseNature.objects.get_or_create(
                        name=nature_name, )
                    expense_natures[nature_name] = expense_nature

                my_table = nature.findNextSibling().find('table')
                tds = my_table.findAll('td')

                # NOTE(review): only the first four cells of the table are
                # read, i.e. one expense per nature -- confirm that each table
                # really holds a single row.
                date = parse_date(tds[0].text)
                cpf_cnpj = self.normalize_cnpj_or_cpf(tds[1].text)
                supplier_name = tds[2].text
                expensed = parse_money(tds[3].text)

                try:
                    supplier = Supplier.objects.get(identifier=cpf_cnpj)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=cpf_cnpj,
                                        name=supplier_name)
                    supplier.save()
                    self.debug(u'New supplier found: {0}'.format(
                        unicode(supplier)))

                # The page provides a single amount, stored both as the
                # presented value and as the expensed value.
                expense = ArchivedExpense(
                    original_id='',
                    number='',
                    nature=expense_nature,
                    date=date,
                    value=expensed,
                    expensed=expensed,
                    mandate=mandate,
                    supplier=supplier,
                    collection_run=self.collection_run,
                )
                expense.save()
                self.debug(u'New expense found: {0}'.format(unicode(expense)))
Ejemplo n.º 6
0
    def update_data_for_month(self, mandate, year, month):
        """Archive the expenses the JSON endpoint reports for one month.

        The payload's ``list`` holds one entry per expense type; each entry
        carries its individual expenses under ``listaDetalheVerba``. Expense
        natures are matched by ``codTipoDespesa`` and suppliers by their
        normalized CPF/CNPJ, both being created when missing.
        """
        self.debug("Updating data for %d-%d - %s" %
                   (year, month, unicode(mandate)))
        uri = "%s/prestacao_contas/verbas_indenizatorias/deputados/%s/%d/%d?formato=json" % (
            self.almg_url, mandate.original_id, year, month)
        payload = self.retrieve_uri(uri, headers=self.headers)

        for entry in payload["list"]:
            try:
                nature = ExpenseNature.objects.get(
                    original_id=entry["codTipoDespesa"])
            except ExpenseNature.DoesNotExist:
                nature = ExpenseNature(original_id=entry["codTipoDespesa"],
                                       name=entry["descTipoDespesa"])
                nature.save()

            for item in entry["listaDetalheVerba"]:
                identifier = self.normalize_cnpj_or_cpf(item["cpfCnpj"])
                try:
                    supplier = Supplier.objects.get(identifier=identifier)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=identifier,
                                        name=item["nomeEmitente"])
                    supplier.save()

                # Fall back to the reference date when the document has no
                # description to use as its number.
                if "descDocumento" not in item:
                    self.debug("No document number, using reference date.")
                    number = item["dataReferencia"]["$"]
                else:
                    number = item["descDocumento"]

                issued_on = item["dataEmissao"]["$"]
                presented = item["valorDespesa"]
                reimbursed = item["valorReembolsado"]

                expense = ArchivedExpense(
                    original_id=item["id"],
                    number=number,
                    nature=nature,
                    date=issued_on,
                    value=presented,
                    expensed=reimbursed,
                    mandate=mandate,
                    supplier=supplier,
                    collection_run=self.collection_run,
                )
                expense.save()

                self.debug("New expense found: %s" % unicode(expense))
Ejemplo n.º 7
0
    def update_data(self):
        """Download the quota dumps and archive this legislature's expenses.

        Steps:

        1. If a ``cdep-collection-run`` marker file exists, delete the
           collection run it names and remove the marker.
        2. Create a new collection run and download/unzip the yearly dumps,
           skipping files the server reports as not modified.
        3. Write the marker, stream every ``DESPESA`` element of the XML
           files and bulk-insert the resulting ``ArchivedExpense`` objects
           in batches of at most ``OBJECT_LIST_MAXIMUM_COUNTER``.
        4. Remove the marker once everything has been inserted.
        """
        # The marker holds the id of the collection run being built; finding
        # it here means the previous run never reached the final unlink.
        if os.path.exists('cdep-collection-run'):
            with open('cdep-collection-run') as marker:
                crid = int(marker.read())
            CollectionRun.objects.get(id=crid).delete()
            os.unlink('cdep-collection-run')

        self.collection_run = self.create_collection_run(self.legislature)

        data_path = os.path.join(os.getcwd(), 'data', 'cdep')

        files_to_download = ['AnoAtual.zip']
        previous_years = date.today().year - self.legislature.date_start.year

        if previous_years:
            files_to_download.append('AnoAnterior.zip')

        if previous_years > 1:
            files_to_download.append('AnosAnteriores.zip')

        files_to_process = list()
        for file_name in files_to_download:
            xml_file_name = file_name.replace('zip', 'xml')
            files_to_process.append(os.path.join(data_path, xml_file_name))

            full_path = os.path.join(data_path, file_name)

            # Ask the server to skip the transfer when our copy is current.
            headers = dict()
            if os.path.exists(full_path):
                headers['If-Modified-Since'] = http_date(
                    os.path.getmtime(full_path), usegmt=True)

            uri = 'http://www.camara.gov.br/cotas/' + file_name
            self.debug(u"Preparing to download %s…" % (uri))
            r = requests.get(uri, headers=headers, stream=True)
            try:
                if r.status_code == requests.codes.not_modified:
                    self.debug(
                        u"File %s not updated since last download, skipping…" %
                        file_name)
                    continue

                with open(full_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection until closed.
                r.close()

            self.debug(u"Unzipping %s…" % (file_name))
            with ZipFile(full_path, 'r') as zf:
                zf.extract(xml_file_name, data_path)

        with open('cdep-collection-run', 'w') as marker:
            marker.write('%d' % (self.collection_run.id))

        archived_expense_list = []
        for file_name in reversed(files_to_process):
            self.debug(u"Processing %s…" % file_name)

            for event, elem in iterparse(file_name, events=("start", "end")):
                if event != "end" or elem.tag != "DESPESA":
                    continue

                # Some entries lack nuLegislatura, so we fall back to numAno.
                legislature_year = elem.find('nuLegislatura')
                if legislature_year is not None:
                    legislature_year = int(legislature_year.text)
                else:
                    legislature_year = int(elem.find('numAno').text)
                    if legislature_year < self.legislature.date_start.year or \
                       legislature_year > self.legislature.date_end.year:
                        legislature_year = None
                    else:
                        legislature_year = self.legislature.date_start.year

                if legislature_year != self.legislature.date_start.year:
                    self.debug(
                        u"Ignoring entry because it's out of the target legislature…"
                    )
                    continue

                name = elem.find('txNomeParlamentar').text.title().strip()

                nature = elem.find('txtDescricao').text.title().strip()

                # A missing node and an empty node both mean "no name".
                supplier_elem = elem.find('txtBeneficiario')
                if supplier_elem is not None and supplier_elem.text:
                    supplier_name = supplier_elem.text.title().strip()
                else:
                    supplier_name = u'Sem nome'

                supplier_identifier = None
                identifier_elem = elem.find('txtCNPJCPF')
                if identifier_elem is not None and identifier_elem.text is not None:
                    supplier_identifier = self.normalize_cnpj_or_cpf(
                        identifier_elem.text)

                if not supplier_identifier:
                    supplier_identifier = u'Sem CNPJ/CPF (%s)' % supplier_name

                try:
                    supplier = Supplier.objects.get(
                        identifier=supplier_identifier)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=supplier_identifier,
                                        name=supplier_name)
                    supplier.save()

                docnumber = elem.find('txtNumero').text
                if docnumber:
                    docnumber = docnumber.strip()

                # The first ten characters of datEmissao are read as
                # year-month-day; without a usable value the expense is
                # dated on the first day of numAno/numMes.
                date_elem = elem.find('datEmissao')
                if date_elem is not None and date_elem.text:
                    expense_date = date(
                        *[int(x) for x in date_elem.text[:10].split('-')])
                else:
                    expense_year = int(elem.find('numAno').text)
                    expense_month = int(elem.find('numMes').text)
                    expense_date = date(expense_year, expense_month, 1)

                expensed = float(elem.find('vlrLiquido').text)

                nature, _ = ExpenseNature.objects.get_or_create(name=nature)

                party = party_name = elem.find('sgPartido')
                if party_name is not None:
                    party_name = self._normalize_party_name(party_name.text)
                    party, _ = PoliticalParty.objects.get_or_create(
                        siglum=party_name)

                state = elem.find('sgUF').text.strip()

                original_id = elem.find('ideCadastro').text.strip()

                try:
                    legislator = Legislator.objects.get(name__iexact=name)
                except Legislator.DoesNotExist:
                    # Some legislators are not listed in the other WS because
                    # they are not in exercise.
                    self.debug(u"Found legislator who's not in exercise: %s" %
                               name)
                    legislator = Legislator(name=name)
                    legislator.save()
                mandate = self.mandate_for_legislator(legislator,
                                                      party,
                                                      state=state,
                                                      original_id=original_id)
                expense = ArchivedExpense(number=docnumber,
                                          nature=nature,
                                          date=expense_date,
                                          expensed=expensed,
                                          mandate=mandate,
                                          supplier=supplier,
                                          collection_run=self.collection_run)
                archived_expense_list.append(expense)
                self.debug(u"New expense found: %s" % unicode(expense))

                # Flush on the list's own size: the list is shared by all
                # files, so leftovers from a previous file count as well.
                if len(archived_expense_list) >= OBJECT_LIST_MAXIMUM_COUNTER:
                    ArchivedExpense.objects.bulk_create(archived_expense_list)
                    archived_expense_list[:] = []
                    reset_queries()

                # Release the processed element and its preceding siblings so
                # the parsed tree does not keep growing.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]

        if archived_expense_list:
            ArchivedExpense.objects.bulk_create(archived_expense_list)

        os.unlink('cdep-collection-run')
Ejemplo n.º 8
0
    def process_expenses_obsolete(self, month, year, legislature, collection_run):
        """Archive expenses from the document returned by ``retrieve_expenses_obsolete``.

        The document is walked as ``g_deputado`` -> ``g_tipo_despesa`` ->
        ``g_beneficiario``; every beneficiary becomes one ``ArchivedExpense``
        attached to ``collection_run``. Nothing happens when no data could be
        retrieved for ``month``/``year``.
        """
        document = self.retrieve_expenses_obsolete(month, year)
        if not document:
            return

        # NOTE(review): names are matched after ``capitalize()``, so the
        # upper-case entry can never match and a name that *starts* with
        # "Total" is not ignored -- confirm whether that is intended.
        ignore_list = [u'total', u'TOTAL', u'utilizado até 30/11/07']

        for deputy in document.findAll('g_deputado'):
            legislator = self.add_legislator(
                deputy.find('nm_deputado').getText().capitalize())

            mandate_lookup = dict(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature,
            )
            try:
                mandate = Mandate.objects.get(**mandate_lookup)
            except Mandate.DoesNotExist:
                mandate = Mandate(**mandate_lookup)
                mandate.save()
                self.debug(u'New Mandate found: %s' % mandate)
            else:
                self.debug(u'Found existing Mandate: %s' % mandate)

            expense_types = deputy.find('list_g_tipo_despesa')

            for expense_type in expense_types.findAll('g_tipo_despesa'):
                nature_text = expense_type.find('nm_tipo_despesa').getText()

                # Keep only what follows the first dash, when there is one.
                pieces = nature_text.split('-', 1)
                if len(pieces) > 1:
                    nature_text = pieces[1].strip()

                nature_text = nature_text.capitalize()

                if any(marker in nature_text for marker in ignore_list):
                    continue

                nature, nature_created = ExpenseNature.objects.get_or_create(
                    name=nature_text)

                if nature_created:
                    self.debug(u'New ExpenseNature found: %s' % nature)
                else:
                    self.debug(u'Found existing ExpenseNature: %s' % nature)

                ref_month = expense_type.find('nr_mes_ref').getText()
                ref_year = expense_type.find('nr_ano_ref').getText()
                expense_date = parse_cmsp_date(ref_month, ref_year)

                for beneficiary in expense_type.findAll('g_beneficiario'):
                    supplier_name = beneficiary.find('nm_beneficiario').getText().capitalize()
                    cnpj = self.normalize_cnpj_or_cpf(
                        beneficiary.find('nr_cnpj').getText())

                    # Nothing identifies the supplier: skip the record.
                    if not (cnpj or supplier_name):
                        continue

                    try:
                        supplier = Supplier.objects.get(identifier=cnpj)
                    except Supplier.DoesNotExist:
                        supplier = Supplier(identifier=cnpj, name=supplier_name)
                        supplier.save()
                        self.debug(u'New Supplier found: %s' % supplier)
                    else:
                        self.debug(u'Found existing supplier: %s' % supplier)

                    expensed = parse_money(beneficiary.find('vl_desp').getText())

                    expense = ArchivedExpense(
                        number='None',
                        nature=nature,
                        date=expense_date,
                        expensed=expensed,
                        mandate=mandate,
                        supplier=supplier,
                        collection_run=collection_run,
                    )
                    expense.save()

                    self.debug(u'New expense found: %s' % expense)
Ejemplo n.º 9
0
    def process_expenses(self, month, year, legislature, collection_run):
        """Archive the expenses published for ``month``/``year``.

        Years before 2015 are delegated to ``process_expenses_obsolete``.
        For later years each ``tabelaportalitemreembolso`` node of the
        retrieved document becomes one ``ArchivedExpense`` attached to
        ``collection_run``.
        """
        if year < 2015:
            return self.process_expenses_obsolete(month, year, legislature, collection_run)

        # CMSP now puts all data year to date on each file, so only the last
        # file of a given year is processed - otherwise data is duplicated.
        now = datetime.now()
        superseded_this_year = year == now.year and month < now.month
        superseded_past_year = year < now.year and month < 12
        if superseded_this_year or superseded_past_year:
            return

        document = self.retrieve_expenses(month, year)
        if not document:
            return

        for item in document.findAll('tabelaportalitemreembolso'):
            legislator = self.add_legislator(
                item.find('vereador').getText().capitalize())

            mandate_lookup = dict(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature,
            )
            try:
                mandate = Mandate.objects.get(**mandate_lookup)
            except Mandate.DoesNotExist:
                mandate = Mandate(**mandate_lookup)
                mandate.save()
                self.debug(u'New Mandate found: %s' % mandate)
            else:
                self.debug(u'Found existing Mandate: %s' % mandate)

            nature_text = item.find('despesa').getText()

            # Keep only what follows the first dash, when there is one.
            pieces = nature_text.split('-', 1)
            if len(pieces) > 1:
                nature_text = pieces[1].strip()

            nature_text = nature_text.capitalize()

            nature, nature_created = ExpenseNature.objects.get_or_create(
                name=nature_text)

            if nature_created:
                self.debug(u'New ExpenseNature found: %s' % nature)
            else:
                self.debug(u'Found existing ExpenseNature: %s' % nature)

            ref_month = item.find('mes').getText()
            ref_year = item.find('ano').getText()
            expense_date = parse_cmsp_date(ref_month, ref_year)

            supplier_name = item.find('fornecedor').getText().capitalize()
            cnpj = self.normalize_cnpj_or_cpf(item.find('cnpj').getText())

            # Nothing identifies the supplier: skip the record.
            if not (cnpj or supplier_name):
                continue

            try:
                supplier = Supplier.objects.get(identifier=cnpj)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=cnpj, name=supplier_name)
                supplier.save()
                self.debug(u'New Supplier found: %s' % supplier)
            else:
                self.debug(u'Found existing supplier: %s' % supplier)

            expensed = float(item.find('valor').getText())

            expense = ArchivedExpense(
                number='None',
                nature=nature,
                date=expense_date,
                expensed=expensed,
                mandate=mandate,
                supplier=supplier,
                collection_run=collection_run,
            )
            expense.save()

            self.debug(u'New expense found: %s' % expense)
Ejemplo n.º 10
0
    def update_data_for_year(self, year=datetime.now().year):
        self.debug("Updating data for year %d" % year)

        csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')

        # FIXME: data containing quote-like characters (like ¨) break pandas parsing as well
        csv_data = csv_data.replace(u'¨', '')

        csv_data = re.sub(r'([^;\n])""+([^;\n])', r'\1"\2', csv_data)
        csv_data = re.sub(r'([^;\n])"([^;\n])', r'\1\2', csv_data)
        data = StringIO(csv_data)

        if data:
            df = pd.read_csv(data, skiprows=1, delimiter=";",
                             parse_dates=[7], decimal=',',
                             error_bad_lines=False,
                             encoding='utf-8').dropna(how='all')

            expected_header = [u'ANO',
                               u'MES',
                               u'SENADOR',
                               u'TIPO_DESPESA',
                               u'CNPJ_CPF',
                               u'FORNECEDOR',
                               u'DOCUMENTO',
                               u'DATA',
                               u'DETALHAMENTO',
                               u'VALOR_REEMBOLSADO']

            actual_header = df.columns.values.tolist()

            if actual_header != expected_header:
                print u'Bad CSV: expected header %s, got %s' % (expected_header, actual_header)
                return

            archived_expense_list = []
            objects_counter = 0
            archived_expense_list_counter = len(df.index)

            for idx, row in df.iterrows():
                name = row["SENADOR"]
                nature = row["TIPO_DESPESA"]
                cpf_cnpj = self.normalize_cnpj_or_cpf(row["CNPJ_CPF"])
                supplier_name = row["FORNECEDOR"]
                docnumber = row["DOCUMENTO"]
                expense_date = row["DATA"]
                expensed = row['VALOR_REEMBOLSADO']

                # FIXME: WTF?
                if isinstance(expensed, unicode):
                    expensed = float(expensed.replace(',', '.').replace('\r\n', ''))

                nature, _ = ExpenseNature.objects.get_or_create(name=nature)

                try:
                    supplier = Supplier.objects.get(identifier=cpf_cnpj)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=cpf_cnpj, name=supplier_name)
                    supplier.save()

                legislator, _ = self.try_name_disambiguation(name)
                if not legislator:
                    legislator = Legislator.objects.get(name__iexact=name)
                mandate = self.mandate_for_legislator(legislator, None)
                expense = ArchivedExpense(number=docnumber,
                                          nature=nature,
                                          date=expense_date,
                                          expensed=expensed,
                                          mandate=mandate,
                                          supplier=supplier,
                                          collection_run=self.collection_run)
                archived_expense_list.append(expense)
                self.debug("New expense found: %s" % unicode(expense))

                objects_counter += 1
                archived_expense_list_counter -= 1

                # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER.
                # If that lists is equal to the maximum object count allowed
                # or if there are no more objects in archived_expense_list,
                # we bulk_create() them and clear the list.

                if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
                    ArchivedExpense.objects.bulk_create(archived_expense_list)
                    archived_expense_list[:] = []
                    objects_counter = 0
                    reset_queries()
        else:
            self.debug("Error downloading file for year %d" % year)
Ejemplo n.º 11
0
    def update_data_for_year(self, year=datetime.now().year):
        self.debug("Updating data for year %d" % year)

        data = StringIO(self.retrieve_data_for_year(year))

        if data:
            df = pd.read_csv(data,
                             skiprows=1,
                             delimiter=";",
                             parse_dates=[7],
                             decimal=',',
                             error_bad_lines=False).dropna(how='all')

            expected_header = [
                u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA', u'CNPJ_CPF',
                u'FORNECEDOR', u'DOCUMENTO', u'DATA', u'DETALHAMENTO',
                u'VALOR_REEMBOLSADO'
            ]

            actual_header = df.columns.values.tolist()

            if actual_header != expected_header:
                print u'Bad CSV: expected header %s, got %s' % (
                    expected_header, actual_header)
                return

            archived_expense_list = []
            objects_counter = 0
            archived_expense_list_counter = len(df.index)

            for idx, row in df.iterrows():
                name = row["SENADOR"]
                nature = row["TIPO_DESPESA"]
                cpf_cnpj = row["CNPJ_CPF"].replace('.', '').replace(
                    '-', '').replace('/', '')
                supplier_name = row["FORNECEDOR"]
                docnumber = row["DOCUMENTO"]
                expense_date = row["DATA"]
                expensed = row['VALOR_REEMBOLSADO']

                nature, _ = ExpenseNature.objects.get_or_create(name=nature)

                try:
                    supplier = Supplier.objects.get(identifier=cpf_cnpj)
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=cpf_cnpj,
                                        name=supplier_name)
                    supplier.save()

                try:
                    legislator = Legislator.objects.get(name__iexact=name)
                    mandate = self.mandate_for_legislator(legislator, None)
                    expense = ArchivedExpense(
                        number=docnumber,
                        nature=nature,
                        date=expense_date,
                        expensed=expensed,
                        mandate=mandate,
                        supplier=supplier,
                        collection_run=self.collection_run)
                    archived_expense_list.append(expense)
                    self.debug("New expense found: %s" % unicode(expense))

                    objects_counter += 1
                    archived_expense_list_counter -= 1

                    # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER.
                    # If that lists is equal to the maximum object count allowed
                    # or if there are no more objects in archived_expense_list,
                    # we bulk_create() them and clear the list.

                    if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
                        ArchivedExpense.objects.bulk_create(
                            archived_expense_list)
                        archived_expense_list[:] = []
                        objects_counter = 0
                        reset_queries()

                except Exception:
                    pass
        else:
            self.debug("Error downloading file for year %d" % year)
Ejemplo n.º 12
0
    def update_data(self):
        """Download the CDEP expense dumps and archive the target legislature.

        Creates a new collection run, fetches the zipped XML dumps from
        ``http://www.camara.gov.br/cotas/`` (files the server reports as not
        modified are not downloaded again) and stores one ``ArchivedExpense``
        per ``DESPESA`` entry that belongs to ``self.legislature``, inserting
        them in batches of ``OBJECT_LIST_MAXIMUM_COUNTER``.

        The ``cdep-collection-run`` marker file holds the id of the run in
        progress. It is removed only when the import finishes, so the next
        invocation deletes whatever an interrupted run left behind.

        Raises:
            requests.HTTPError: if the server answers a download with an
                error status.
        """
        marker_path = 'cdep-collection-run'

        # A leftover marker means the previous run did not finish: drop its
        # partial data before starting over.
        if os.path.exists(marker_path):
            with open(marker_path) as marker:
                crid = int(marker.read())
            try:
                CollectionRun.objects.get(id=crid).delete()
            except CollectionRun.DoesNotExist:
                self.debug(u"Collection run %d is already gone, removing stale marker" % crid)
            os.unlink(marker_path)

        self.collection_run = self.create_collection_run(self.legislature)

        # Record the run immediately so that a failure in any later step
        # (downloads included) is cleaned up by the next invocation.
        with open(marker_path, 'w') as marker:
            marker.write('%d' % (self.collection_run.id))

        data_path = os.path.join(os.getcwd(), 'data', 'cdep')

        files_to_download = ['AnoAtual.zip']
        previous_years = date.today().year - self.legislature.date_start.year

        if previous_years:
            files_to_download.append('AnoAnterior.zip')

        if previous_years > 1:
            files_to_download.append('AnosAnteriores.zip')

        files_to_process = []
        for file_name in files_to_download:
            xml_file_name = file_name.replace('zip', 'xml')
            # Queued even when the download is skipped below: the XML
            # extracted by an earlier run is processed again.
            files_to_process.append(os.path.join(data_path, xml_file_name))

            full_path = os.path.join(data_path, file_name)

            headers = {}
            if os.path.exists(full_path):
                headers['If-Modified-Since'] = http_date(os.path.getmtime(full_path), usegmt=True)

            uri = 'http://www.camara.gov.br/cotas/' + file_name
            self.debug(u"Preparing to download %s…" % (uri))
            r = requests.get(uri, headers=headers, stream=True)

            if r.status_code == requests.codes.not_modified:
                self.debug(u"File %s not updated since last download, skipping…" % file_name)
                continue

            # Fail before touching the cached archive: writing an error page
            # over it would corrupt the zip and refresh the mtime used for
            # If-Modified-Since.
            r.raise_for_status()

            with open(full_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            self.debug(u"Unzipping %s…" % (file_name))
            with ZipFile(full_path, 'r') as zf:
                zf.extract(xml_file_name, data_path)

        archived_expense_list = []
        for file_name in reversed(files_to_process):
            self.debug(u"Processing %s…" % file_name)

            for _, elem in iterparse(file_name, events=("end",)):
                if elem.tag != "DESPESA":
                    continue

                expense = self._build_expense_from_element(elem)
                if expense is None:
                    self.debug(u"Ignoring entry because it's out of the target legislature…")
                else:
                    archived_expense_list.append(expense)
                    self.debug(u"New expense found: %s" % unicode(expense))

                    if len(archived_expense_list) >= OBJECT_LIST_MAXIMUM_COUNTER:
                        ArchivedExpense.objects.bulk_create(archived_expense_list)
                        archived_expense_list[:] = []
                        reset_queries()

                # Release the entry and its already-handled siblings whether
                # it was kept or ignored; otherwise the partially built tree
                # keeps growing while the file is streamed.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]

        if archived_expense_list:
            ArchivedExpense.objects.bulk_create(archived_expense_list)

        os.unlink(marker_path)

    def _build_expense_from_element(self, elem):
        """Build an unsaved ``ArchivedExpense`` from one ``DESPESA`` element.

        Returns ``None`` when the entry does not belong to
        ``self.legislature``. Suppliers, expense natures, parties and
        legislators that are not in the database yet are created on the way.
        """
        target_year = self.legislature.date_start.year

        # Some entries lack nuLegislatura, so we fall back to numAno.
        legislature_elem = elem.find('nuLegislatura')
        if legislature_elem is not None:
            in_target = int(legislature_elem.text) == target_year
        else:
            entry_year = int(elem.find('numAno').text)
            in_target = target_year <= entry_year <= self.legislature.date_end.year

        if not in_target:
            return None

        name = elem.find('txNomeParlamentar').text.title()

        nature_name = elem.find('txtDescricao').text.title()

        # An element that is present but empty has no text to title-case.
        supplier_elem = elem.find('txtBeneficiario')
        if supplier_elem is not None and supplier_elem.text:
            supplier_name = supplier_elem.text.title()
        else:
            supplier_name = u'Sem nome'

        supplier_identifier = elem.find('txtCNPJCPF')
        if supplier_identifier is not None:
            supplier_identifier = self.normalize_cnpj_or_cpf(supplier_identifier.text)

        if not supplier_identifier:
            supplier_identifier = u'Sem CNPJ/CPF (%s)' % supplier_name

        supplier, _ = Supplier.objects.get_or_create(
            identifier=supplier_identifier, defaults={'name': supplier_name})

        docnumber = elem.find('txtNumero').text

        # datEmissao starts with YYYY-MM-DD; without it the expense is dated
        # to the first day of its reference month.
        emission = elem.find('datEmissao')
        if emission is not None and emission.text:
            expense_date = date(*[int(part) for part in emission.text[:10].split('-')])
        else:
            expense_date = date(int(elem.find('numAno').text),
                                int(elem.find('numMes').text),
                                1)

        expensed = float(elem.find('vlrLiquido').text)

        nature, _ = ExpenseNature.objects.get_or_create(name=nature_name)

        party = None
        party_elem = elem.find('sgPartido')
        if party_elem is not None:
            party_name = self._normalize_party_name(party_elem.text)
            party, _ = PoliticalParty.objects.get_or_create(siglum=party_name)

        state = elem.find('sgUF').text

        original_id = elem.find('ideCadastro').text

        try:
            legislator = Legislator.objects.get(name__iexact=name)
        except Legislator.DoesNotExist:
            # Some legislators are not listed in the other WS because they
            # are not in exercise.
            self.debug(u"Found legislator who's not in exercise: %s" % name)
            legislator = Legislator(name=name)
            legislator.save()

        mandate = self.mandate_for_legislator(legislator, party,
                                              state=state, original_id=original_id)

        return ArchivedExpense(number=docnumber,
                               nature=nature,
                               date=expense_date,
                               expensed=expensed,
                               mandate=mandate,
                               supplier=supplier,
                               collection_run=self.collection_run)
Ejemplo n.º 13
0
    def update_data_for_year(self, year=None):
        """Import the expenses CSV of ``year`` as ``ArchivedExpense`` rows.

        Args:
            year: year to import; defaults to the current year, resolved at
                call time.

        The CSV returned by ``self.retrieve_data_for_year`` is parsed with
        pandas (first line skipped, ``;`` separated, ``,`` as decimal mark).
        Rows whose legislator or mandate cannot be resolved are skipped with
        a debug message; every other row is inserted in batches of
        ``OBJECT_LIST_MAXIMUM_COUNTER``. Returns ``None``.
        """
        if year is None:
            # A default written in the signature would be evaluated once, at
            # import time, and go stale in a long-running process.
            year = datetime.now().year

        self.debug("Updating data for year %d" % year)

        # Test the raw payload: a StringIO object is always truthy, so
        # checking the wrapper can never detect a failed download.
        raw_data = self.retrieve_data_for_year(year)
        if not raw_data:
            self.debug("Error downloading file for year %d" % year)
            return

        df = pd.read_csv(StringIO(raw_data), skiprows=1, delimiter=";",
                         parse_dates=[7], decimal=',',
                         error_bad_lines=False).dropna(how='all')

        expected_header = [u'ANO',
                           u'MES',
                           u'SENADOR',
                           u'TIPO_DESPESA',
                           u'CNPJ_CPF',
                           u'FORNECEDOR',
                           u'DOCUMENTO',
                           u'DATA',
                           u'DETALHAMENTO',
                           u'VALOR_REEMBOLSADO']

        actual_header = df.columns.values.tolist()

        if actual_header != expected_header:
            print(u'Bad CSV: expected header %s, got %s' % (expected_header, actual_header))
            return

        archived_expense_list = []

        for _, row in df.iterrows():
            name = row["SENADOR"]
            cpf_cnpj = row["CNPJ_CPF"].replace('.', '').replace('-', '').replace('/', '')
            supplier_name = row["FORNECEDOR"]

            nature, _ = ExpenseNature.objects.get_or_create(name=row["TIPO_DESPESA"])

            supplier, _ = Supplier.objects.get_or_create(
                identifier=cpf_cnpj, defaults={'name': supplier_name})

            # Best effort: a row that cannot be tied to a mandate is skipped,
            # but the reason is reported instead of silently discarded.
            try:
                legislator = Legislator.objects.get(name__iexact=name)
                mandate = self.mandate_for_legislator(legislator, None)
                expense = ArchivedExpense(number=row["DOCUMENTO"],
                                          nature=nature,
                                          date=row["DATA"],
                                          expensed=row['VALOR_REEMBOLSADO'],
                                          mandate=mandate,
                                          supplier=supplier,
                                          collection_run=self.collection_run)
            except Exception as error:
                self.debug(u"Skipping expense of %s: %r" % (name, error))
                continue

            archived_expense_list.append(expense)
            self.debug("New expense found: %s" % unicode(expense))

            # Database errors are deliberately not swallowed here: a failed
            # insert would otherwise lose the whole batch unnoticed.
            if len(archived_expense_list) >= OBJECT_LIST_MAXIMUM_COUNTER:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                reset_queries()

        # Flush the remainder explicitly; counting rows down to zero misses
        # it as soon as a single row is skipped.
        if archived_expense_list:
            ArchivedExpense.objects.bulk_create(archived_expense_list)
            archived_expense_list[:] = []
            reset_queries()
Ejemplo n.º 14
0
    def update_data_for_year(self, year=None):
        """Import the expenses CSV of ``year`` as ``ArchivedExpense`` rows.

        Args:
            year: year to import; defaults to the current year, resolved at
                call time.

        The CSV returned by ``self.retrieve_data_for_year`` is cleaned of
        stray quote characters, parsed with pandas and inserted in batches
        of ``OBJECT_LIST_MAXIMUM_COUNTER``. Returns ``None``.

        Raises:
            Legislator.DoesNotExist: if a row names a legislator that neither
                ``try_name_disambiguation`` nor an exact case-insensitive
                lookup can find.
        """
        if year is None:
            # A default written in the signature would be evaluated once, at
            # import time, and go stale in a long-running process.
            year = datetime.now().year

        self.debug("Updating data for year %d" % year)

        # Test the raw payload: a StringIO object is always truthy, so
        # checking the wrapper can never detect a failed download.
        raw_data = self.retrieve_data_for_year(year)
        if not raw_data:
            self.debug("Error downloading file for year %d" % year)
            return

        csv_data = raw_data.replace('\r\n', '\n')

        # FIXME: data containing quote-like characters (like ¨) break pandas parsing as well
        csv_data = csv_data.replace(u'¨', '')

        # Drop quotes embedded inside a field (not next to ';' or a line
        # break): runs are first collapsed to one quote, then a single
        # embedded quote is removed.
        csv_data = re.sub(r'([^;\n])""+([^;\n])', r'\1"\2', csv_data)
        csv_data = re.sub(r'([^;\n])"([^;\n])', r'\1\2', csv_data)

        df = pd.read_csv(StringIO(csv_data),
                         skiprows=1,
                         delimiter=";",
                         parse_dates=[7],
                         decimal=',',
                         error_bad_lines=False,
                         encoding='utf-8').dropna(how='all')

        expected_header = [
            u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA', u'CNPJ_CPF',
            u'FORNECEDOR', u'DOCUMENTO', u'DATA', u'DETALHAMENTO',
            u'VALOR_REEMBOLSADO'
        ]

        actual_header = df.columns.values.tolist()

        if actual_header != expected_header:
            print(u'Bad CSV: expected header %s, got %s' % (
                expected_header, actual_header))
            return

        archived_expense_list = []

        for _, row in df.iterrows():
            name = row["SENADOR"]
            cpf_cnpj = self.normalize_cnpj_or_cpf(row["CNPJ_CPF"])
            supplier_name = row["FORNECEDOR"]
            expensed = row['VALOR_REEMBOLSADO']

            # FIXME: the value sometimes arrives as text instead of a number
            # (cause unknown); coerce it by hand in that case.
            if isinstance(expensed, unicode):
                expensed = float(
                    expensed.replace(',', '.').replace('\r\n', ''))

            nature, _ = ExpenseNature.objects.get_or_create(name=row["TIPO_DESPESA"])

            supplier, _ = Supplier.objects.get_or_create(
                identifier=cpf_cnpj, defaults={'name': supplier_name})

            legislator, _ = self.try_name_disambiguation(name)
            if not legislator:
                legislator = Legislator.objects.get(name__iexact=name)
            mandate = self.mandate_for_legislator(legislator, None)

            expense = ArchivedExpense(number=row["DOCUMENTO"],
                                      nature=nature,
                                      date=row["DATA"],
                                      expensed=expensed,
                                      mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug("New expense found: %s" % unicode(expense))

            if len(archived_expense_list) >= OBJECT_LIST_MAXIMUM_COUNTER:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                reset_queries()

        # Whatever did not fill a complete batch is written once the loop is
        # over, instead of relying on a row countdown reaching zero.
        if archived_expense_list:
            ArchivedExpense.objects.bulk_create(archived_expense_list)
            archived_expense_list[:] = []
            reset_queries()
Ejemplo n.º 15
0
    def update_data_for_year(self, year):
        """Import the expenses CSV of ``year`` as ``ArchivedExpense`` rows.

        The text returned by ``self.retrieve_data_for_year`` is parsed with
        ``rows`` after dropping its first line. Rows without a senator name
        are ignored; every other row becomes an ``ArchivedExpense`` tied to
        ``self.collection_run``, inserted in batches of
        ``OBJECT_LIST_MAXIMUM_COUNTER``. Expense natures, legislators and
        mandates are cached in memory for the duration of the call.

        Returns ``None``. A failed retrieval, an empty file or an unexpected
        header is reported and ends the import without raising.
        """
        self.debug(u'Updating data for year {0}'.format(year))

        try:
            csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')
        except Exception:
            print(u'Not found data for year {0}'.format(year))
            return

        # Skip first line. partition() copes with a payload that has no line
        # break at all, where unpacking split('\n', 1) would raise.
        _, _, tail = csv_data.partition('\n')
        self.debug(u'Reading file...')
        if not tail.strip():
            self.debug(u'Error downloading file for year {0}'.format(year))
            return

        data = rows.import_from_csv(BytesIO(tail.encode('utf-8')))

        if not data:
            self.debug(u'Error downloading file for year {0}'.format(year))
            return

        expected_header = [
            u'ano',
            u'mes',
            u'senador',
            u'tipo_despesa',
            u'cnpj_cpf',
            u'fornecedor',
            u'documento',
            u'data',
            u'detalhamento',
            u'valor_reembolsado',
        ]

        # list() keeps the comparison meaningful when keys() is a view.
        actual_header = list(data.fields.keys())

        if actual_header != expected_header:
            # FIXME
            print(u'Bad CSV: expected header {0}, got {1}'.format(
                expected_header, actual_header))
            return

        archived_expense_list = []

        legislators = {}
        mandates = {}
        natures = {}

        for row in data:
            if not row.senador:
                self.debug(
                    u'Ignoring row without senator name (year {0})'.format(year))
                continue

            if not row.data:
                # No document date: use the first day of the reference month.
                first_of_month = '01/{0}/{1}'.format(row.mes, row.ano)
                expense_date = datetime.strptime(first_of_month, '%d/%m/%Y')
            else:
                expense_date = datetime.strptime(row.data, '%d/%m/%Y')

            name = self._normalize_name(row.senador)
            nature = row.tipo_despesa
            cpf_cnpj = self.normalize_cnpj_or_cpf(row.cnpj_cpf)
            supplier_name = row.fornecedor
            expensed = row.valor_reembolsado

            # FIXME: the value sometimes arrives as text instead of a number
            # (cause unknown); coerce it by hand in that case.
            if isinstance(expensed, unicode):
                expensed = float(
                    expensed.replace(',', '.').replace('\r',
                                                       '').replace('\n', ''))

            # memory cache
            expense_nature = natures.get(nature)
            if not expense_nature:
                expense_nature, _ = ExpenseNature.objects.get_or_create(
                    name=nature)
                natures[nature] = expense_nature

            supplier, created = Supplier.objects.get_or_create(
                identifier=cpf_cnpj, defaults={'name': supplier_name})
            if created:
                self.debug(u'New supplier found: {0}'.format(
                    unicode(supplier)))

            # memory cache
            legislator = legislators.get(name)
            if not legislator:
                legislator = self._get_or_create_legislator(name)
                legislators[name] = legislator

            # memory cache
            mandate = mandates.get(name)
            if not mandate:
                mandate = self.mandate_for_legislator(legislator, None)
                mandates[name] = mandate

            expense = ArchivedExpense(number=row.documento,
                                      nature=expense_nature,
                                      date=expense_date,
                                      expensed=expensed,
                                      mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug(u'New expense found: {0}'.format(unicode(expense)))

            # Insert in batches of at most OBJECT_LIST_MAXIMUM_COUNTER.
            if len(archived_expense_list) >= OBJECT_LIST_MAXIMUM_COUNTER:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                reset_queries()

        # Flush the remainder explicitly: counting rows down to zero never
        # gets there once a row has been skipped, which used to drop the
        # last partial batch.
        if archived_expense_list:
            ArchivedExpense.objects.bulk_create(archived_expense_list)
            archived_expense_list[:] = []
            reset_queries()