def update_data_for_month(self, mandate, year, month):
    """Store every expense entry found for the given mandate and month."""
    for entry in self.find_data_for_month(mandate, year, month):
        nature = self.get_or_create_expense_nature(
            '{0}: {1}'.format(entry['budget_title'], entry['budget_subtitle'])
        )

        supplier_name = entry.get('nome') or 'Sem nome'
        identifier = self.normalize_cnpj_or_cpf(entry.get('cpf_cnpj'))
        if not identifier:
            # No usable CNPJ/CPF; fall back to a name-based placeholder.
            identifier = u'Sem CPF/CNPJ ({0})'.format(supplier_name)

        try:
            supplier = Supplier.objects.get(identifier=identifier)
        except Supplier.DoesNotExist:
            supplier = Supplier(identifier=identifier, name=supplier_name)
            supplier.save()  # FIXME
        except Supplier.MultipleObjectsReturned:
            # Duplicate identifiers exist in the DB; keep the second match
            # (preserving the original arbitrary choice).
            supplier = Supplier.objects.filter(identifier=identifier)[1]

        expense = ArchivedExpense(
            number=entry.get('number', ''),
            nature=nature,
            date=datetime.strptime(entry['date'], '%d/%m/%Y'),
            value=entry['value_presented'],
            expensed=entry['value_expensed'],
            mandate=mandate,
            supplier=supplier,
            collection_run=self.collection_run,
        )
        expense.save()
def get_or_create_supplier(self, identifier, name=None):
    """Return the Supplier matching *identifier*, creating it if needed.

    The identifier is normalized (CNPJ/CPF punctuation stripped) before
    lookup, so callers may pass the raw scraped value.
    """
    identifier = self.normalize_cnpj_or_cpf(identifier)
    # get_or_create() replaces the racy get()/save() dance and reports
    # whether the row is new, so we only log actual creations.
    supplier, created = Supplier.objects.get_or_create(
        identifier=identifier, defaults={'name': name})
    if created:
        self.debug(u'New supplier found: {0}'.format(unicode(supplier)))
    return supplier
def update_data_for_legislator(self, data, code, month, year):
    """Parse one legislator's expense page and store every row found.

    *data* is a parsed HTML document; *code* is the legislator's
    original id on the source site.
    """
    self.debug("Updating data %s/%s for legislator: %s" % (month, year, code))
    data = data.find('div', {'class': 'row'})
    # The legislator's name is the first child of the page's <h2>.
    legislator = data.find('h2').findChildren()[0].next
    legislator = self._normalize_name(legislator)
    legislator, created = Legislator.objects.get_or_create(name=legislator)
    if created:
        self.debug("New legislator: %s" % unicode(legislator))
    else:
        self.debug("Found existing legislator: %s" % unicode(legislator))
    mandate = self.mandate_for_legislator(legislator, party=None, original_id=code)
    # Each <h3> names an expense nature; the table that follows it holds
    # that nature's rows (first and last <tr> are header/footer).
    natures = data.findAll('h3')
    for data in natures:
        nature, _ = ExpenseNature.objects.get_or_create(name=self._normalize_nature(data.text))
        rows = data.findNext().findAll('tr')[1:-1]
        for row in rows:
            columns = row.findAll('td')
            if not len(columns) == 5:
                print u'Bad row: %s' % unicode(columns)
                continue
            cnpj = self.normalize_cnpj_or_cpf(columns[0].getText())
            supplier_name = columns[1].getText().strip()
            try:
                supplier_name = supplier_name.decode('utf-8')
            except Exception:
                # Already unicode (or undecodable) -- keep it as-is.
                pass
            try:
                supplier = Supplier.objects.get(identifier=cnpj)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=cnpj, name=supplier_name)
                supplier.save()
            docnumber = columns[2].getText()
            expensed = parse_money(columns[3].getText())
            # Expenses are recorded against the first day of the month.
            expense = ArchivedExpense(number=docnumber, nature=nature,
                                      date=date(year, month, 1),
                                      expensed=expensed, mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            expense.save()
            self.debug("New expense found: %s" % unicode(expense))
def _update_data_for_year(self, mandates, year, month):
    """Fetch and store the transparency page for each mandate for year/month."""
    for mandate in mandates:
        url = '{0}/?dep={1}&ano={2}&mes={3}'.format(
            TRANSPARENCIA_URL,
            mandate.original_id,
            year,
            month,
        )
        expense_natures = {}
        expenses_data = self.retrieve_uri(url)
        natures = expenses_data.find(id='div-com-verba').findAll('h4')
        for nature in natures:
            # memory cache: avoid one DB round trip per repeated nature name
            nature_name = nature.text.split('-')[1].strip()
            expense_nature = expense_natures.get(nature_name)
            if not expense_nature:
                expense_nature, _ = ExpenseNature.objects.get_or_create(
                    name=nature_name,
                )
                expense_natures[nature_name] = expense_nature
            my_table = nature.findNextSibling().find('table')
            # NOTE(review): only the first four <td> cells are read, so a
            # table with several rows would yield a single expense --
            # confirm the source page really has one row per nature.
            tds = my_table.findAll('td')
            date = parse_date(tds[0].text)
            cpf_cnpj = self.normalize_cnpj_or_cpf(tds[1].text)
            supplier_name = tds[2].text
            expensed = parse_money(tds[3].text)
            try:
                supplier = Supplier.objects.get(identifier=cpf_cnpj)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=cpf_cnpj, name=supplier_name)
                supplier.save()
                self.debug(u'New supplier found: {0}'.format(
                    unicode(supplier)))
            expense = ArchivedExpense(
                original_id='',
                number='',
                nature=expense_nature,
                date=date,
                value=expensed,
                expensed=expensed,
                mandate=mandate,
                supplier=supplier,
                collection_run=self.collection_run,
            )
            expense.save()
            self.debug(u'New expense found: {0}'.format(unicode(expense)))
def update_data_for_month(self, mandate, year, month):
    """Download and store the ALMG expense JSON for one mandate/month."""
    self.debug("Updating data for %d-%d - %s" % (year, month, unicode(mandate)))

    uri = "%s/prestacao_contas/verbas_indenizatorias/deputados/%s/%d/%d?formato=json" % (
        self.almg_url, mandate.original_id, year, month)

    for entry in self.retrieve_uri(uri, headers=self.headers)["list"]:
        # get_or_create() replaces the get()/save() pattern and avoids
        # duplicate creation if two runs race on the same nature.
        nature, _ = ExpenseNature.objects.get_or_create(
            original_id=entry["codTipoDespesa"],
            defaults={'name': entry["descTipoDespesa"]})

        for details in entry["listaDetalheVerba"]:
            cnpj = self.normalize_cnpj_or_cpf(details["cpfCnpj"])
            supplier, _ = Supplier.objects.get_or_create(
                identifier=cnpj, defaults={'name': details["nomeEmitente"]})

            if "descDocumento" in details:
                number = details["descDocumento"]
            else:
                # Some entries carry no document number at all.
                self.debug("No document number, using reference date.")
                number = details["dataReferencia"]["$"]

            expense = ArchivedExpense(original_id=details["id"],
                                      number=number,
                                      nature=nature,
                                      date=details["dataEmissao"]["$"],
                                      value=details["valorDespesa"],
                                      expensed=details["valorReembolsado"],
                                      mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            expense.save()

            self.debug("New expense found: %s" % unicode(expense))
def update_data(self):
    """Download the Chamber of Deputies expense XML dumps and import them.

    A 'cdep-collection-run' marker file makes interrupted runs
    restartable: a stale run found on disk is deleted before a fresh
    collection run is created, and the marker is removed on success.
    """
    if os.path.exists('cdep-collection-run'):
        # A previous run died mid-import; drop its partial data.
        crid = int(open('cdep-collection-run').read())
        CollectionRun.objects.get(id=crid).delete()
        os.unlink('cdep-collection-run')
    self.collection_run = self.create_collection_run(self.legislature)
    data_path = os.path.join(os.getcwd(), 'data', 'cdep')

    # AnoAtual covers the current year; the other archives are only
    # needed when the legislature started in earlier years.
    files_to_download = ['AnoAtual.zip']
    previous_years = date.today().year - self.legislature.date_start.year
    if previous_years:
        files_to_download.append('AnoAnterior.zip')
    if previous_years > 1:
        files_to_download.append('AnosAnteriores.zip')

    files_to_process = list()
    for file_name in files_to_download:
        xml_file_name = file_name.replace('zip', 'xml')
        full_xml_path = os.path.join(data_path, xml_file_name)
        files_to_process.append(os.path.join(data_path, full_xml_path))
        full_path = os.path.join(data_path, file_name)
        headers = dict()
        if os.path.exists(full_path):
            # Conditional GET: skip the download when unchanged upstream.
            headers['If-Modified-Since'] = http_date(
                os.path.getmtime(full_path), usegmt=True)
        uri = 'http://www.camara.gov.br/cotas/' + file_name
        self.debug(u"Preparing to download %s…" % (uri))
        r = requests.get(uri, headers=headers, stream=True)
        if r.status_code == requests.codes.not_modified:
            self.debug(
                u"File %s not updated since last download, skipping…" % file_name)
            continue
        with open(full_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        self.debug(u"Unzipping %s…" % (file_name))
        zf = ZipFile(full_path, 'r')
        zf.extract(xml_file_name, data_path)

    # Record the in-progress run so a crash can be cleaned up next time.
    open('cdep-collection-run', 'w').write('%d' % (self.collection_run.id))

    archived_expense_list = []
    for file_name in reversed(files_to_process):
        self.debug(u"Processing %s…" % file_name)
        objects_counter = 0
        # Stream-parse the (large) XML to keep memory usage bounded.
        context = iterparse(file_name, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        for event, elem in context:
            if event != "end" or elem.tag != "DESPESA":
                continue

            # Some entries lack numLegislatura, so we fallback to numAno.
            legislature_year = elem.find('nuLegislatura')
            if legislature_year is not None:
                legislature_year = int(legislature_year.text)
            else:
                legislature_year = int(elem.find('numAno').text)
                if legislature_year < self.legislature.date_start.year or \
                        legislature_year > self.legislature.date_end.year:
                    legislature_year = None
                else:
                    legislature_year = self.legislature.date_start.year
            if legislature_year != self.legislature.date_start.year:
                self.debug(
                    u"Ignoring entry because it's out of the target legislature…"
                )
                continue

            name = elem.find('txNomeParlamentar').text.title().strip()
            nature = elem.find('txtDescricao').text.title().strip()

            supplier_name = elem.find('txtBeneficiario')
            if supplier_name is not None:
                supplier_name = supplier_name.text.title().strip()
            else:
                supplier_name = u'Sem nome'

            supplier_identifier = elem.find('txtCNPJCPF')
            if supplier_identifier is not None and supplier_identifier.text is not None:
                supplier_identifier = self.normalize_cnpj_or_cpf(
                    supplier_identifier.text)
            if not supplier_identifier:
                # No usable CNPJ/CPF: use a name-based placeholder.
                supplier_identifier = u'Sem CNPJ/CPF (%s)' % supplier_name
            try:
                supplier = Supplier.objects.get(
                    identifier=supplier_identifier)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=supplier_identifier,
                                    name=supplier_name)
                supplier.save()

            docnumber = elem.find('txtNumero').text
            if docnumber:
                docnumber = docnumber.strip()

            expense_date = elem.find('datEmissao')
            if expense_date is not None:
                # ISO date prefix (YYYY-MM-DD) -> date object.
                expense_date = date(
                    *((int(x.lstrip('0')) for x in expense_date.text[:10].split('-'))))
            else:
                # No issue date: use the first day of the reference month.
                expense_year = int(elem.find('numAno').text)
                expense_month = int(elem.find('numMes').text)
                expense_date = date(expense_year, expense_month, 1)

            expensed = float(elem.find('vlrLiquido').text)
            nature, _ = ExpenseNature.objects.get_or_create(name=nature)

            party = party_name = elem.find('sgPartido')
            if party_name is not None:
                party_name = self._normalize_party_name(party_name.text)
                party, _ = PoliticalParty.objects.get_or_create(
                    siglum=party_name)

            state = elem.find('sgUF').text.strip()
            original_id = elem.find('ideCadastro').text.strip()
            try:
                legislator = Legislator.objects.get(name__iexact=name)
            except Legislator.DoesNotExist:
                # Some legislators are not listed in the other WS because
                # they are not in exercise.
                self.debug(u"Found legislator who's not in exercise: %s" % name)
                legislator = Legislator(name=name)
                legislator.save()
            mandate = self.mandate_for_legislator(legislator, party,
                                                  state=state,
                                                  original_id=original_id)

            expense = ArchivedExpense(number=docnumber,
                                      nature=nature,
                                      date=expense_date,
                                      expensed=expensed,
                                      mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug(u"New expense found: %s" % unicode(expense))

            # Persist in batches to bound memory and query-log growth.
            objects_counter += 1
            if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                objects_counter = 0
                reset_queries()

            # Free the parsed element and its preceding siblings so the
            # tree does not grow while streaming.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
            continue

    if archived_expense_list:
        ArchivedExpense.objects.bulk_create(archived_expense_list)
    os.unlink('cdep-collection-run')
def process_expenses_obsolete(self, month, year, legislature, collection_run):
    """Import pre-2015 CMSP expense XML for one month.

    Structure: g_deputado -> g_tipo_despesa -> g_beneficiario, with
    summary rows ("total" etc.) skipped.
    """
    data = self.retrieve_expenses_obsolete(month, year)
    if not data:
        return
    for x in data.findAll('g_deputado'):
        name = x.find('nm_deputado').getText().capitalize()
        legislator = self.add_legislator(name)
        try:
            mandate = Mandate.objects.get(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature)
            self.debug(u'Found existing Mandate: %s' % mandate)
        except Mandate.DoesNotExist:
            mandate = Mandate(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature)
            mandate.save()
            self.debug(u'New Mandate found: %s' % mandate)
        expense_type = x.find('list_g_tipo_despesa')
        for i in expense_type.findAll('g_tipo_despesa'):
            nature_text = i.find('nm_tipo_despesa').getText()
            # Nature names come as "<code> - <name>"; keep the name part.
            try:
                nature_text = nature_text.split('-', 1)[1].strip()
            except IndexError:
                pass
            nature_text = nature_text.capitalize()
            # Skip summary/total rows -- they are not real natures.
            ignore_list = [u'total', u'TOTAL', u'utilizado até 30/11/07']
            ignore_matches = [s for s in ignore_list if s in nature_text]
            if ignore_matches:
                continue
            nature, nature_created = ExpenseNature.objects.get_or_create(
                name=nature_text)
            if nature_created:
                self.debug(u'New ExpenseNature found: %s' % nature)
            else:
                self.debug(u'Found existing ExpenseNature: %s' % nature)
            m_month = i.find('nr_mes_ref').getText()
            m_year = i.find('nr_ano_ref').getText()
            date = parse_cmsp_date(m_month, m_year)
            for j in i.findAll('g_beneficiario'):
                supplier_name = j.find('nm_beneficiario').getText()
                supplier_name = supplier_name.capitalize()
                cnpj = self.normalize_cnpj_or_cpf(j.find('nr_cnpj').getText())
                # Entries with neither identifier nor name are useless.
                if not cnpj and not supplier_name:
                    continue
                try:
                    supplier = Supplier.objects.get(identifier=cnpj)
                    supplier_created = False
                except Supplier.DoesNotExist:
                    supplier = Supplier(identifier=cnpj, name=supplier_name)
                    supplier.save()
                    supplier_created = True
                if supplier_created:
                    self.debug(u'New Supplier found: %s' % supplier)
                else:
                    self.debug(u'Found existing supplier: %s' % supplier)
                expensed = parse_money(j.find('vl_desp').getText())
                expense = ArchivedExpense(number='None', nature=nature,
                                          date=date, expensed=expensed,
                                          mandate=mandate, supplier=supplier,
                                          collection_run=collection_run)
                expense.save()
                self.debug(u'New expense found: %s' % expense)
def process_expenses(self, month, year, legislature, collection_run):
    """Import CMSP expense XML for one month (2015+ format).

    Pre-2015 data is delegated to process_expenses_obsolete().
    """
    if year < 2015:
        return self.process_expenses_obsolete(month, year, legislature,
                                              collection_run)
    # CMSP now puts all data year to date on each file, so we need to get only the
    # last one for a given year - otherwise we duplicate data.
    today = datetime.now()
    if year == today.year and month < today.month:
        return
    elif year < today.year and month < 12:
        return
    data = self.retrieve_expenses(month, year)
    if not data:
        return
    for x in data.findAll('tabelaportalitemreembolso'):
        name = x.find('vereador').getText().capitalize()
        legislator = self.add_legislator(name)
        try:
            mandate = Mandate.objects.get(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature)
            self.debug(u'Found existing Mandate: %s' % mandate)
        except Mandate.DoesNotExist:
            mandate = Mandate(
                legislator=legislator,
                date_start=legislature.date_start,
                legislature=legislature)
            mandate.save()
            self.debug(u'New Mandate found: %s' % mandate)
        # Nature names come as "<code> - <name>"; keep the name part.
        nature_text = x.find('despesa').getText()
        try:
            nature_text = nature_text.split('-', 1)[1].strip()
        except IndexError:
            pass
        nature_text = nature_text.capitalize()
        nature, nature_created = ExpenseNature.objects.get_or_create(
            name=nature_text)
        if nature_created:
            self.debug(u'New ExpenseNature found: %s' % nature)
        else:
            self.debug(u'Found existing ExpenseNature: %s' % nature)
        m_month = x.find('mes').getText()
        m_year = x.find('ano').getText()
        date = parse_cmsp_date(m_month, m_year)
        supplier_name = x.find('fornecedor').getText()
        supplier_name = supplier_name.capitalize()
        cnpj = self.normalize_cnpj_or_cpf(x.find('cnpj').getText())
        # Entries with neither identifier nor name are useless.
        if not cnpj and not supplier_name:
            continue
        try:
            supplier = Supplier.objects.get(identifier=cnpj)
            supplier_created = False
        except Supplier.DoesNotExist:
            supplier = Supplier(identifier=cnpj, name=supplier_name)
            supplier.save()
            supplier_created = True
        if supplier_created:
            self.debug(u'New Supplier found: %s' % supplier)
        else:
            self.debug(u'Found existing supplier: %s' % supplier)
        expensed = float(x.find('valor').getText())
        expense = ArchivedExpense(number='None', nature=nature, date=date,
                                  expensed=expensed, mandate=mandate,
                                  supplier=supplier,
                                  collection_run=collection_run)
        expense.save()
        self.debug(u'New expense found: %s' % expense)
def update_data_for_year(self, year=datetime.now().year):
    """Import the Senate reimbursement CSV for *year* via pandas.

    NOTE(review): the default value is evaluated at import time, so a
    long-lived process will keep using the year the module was loaded.
    """
    self.debug("Updating data for year %d" % year)
    csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')
    # FIXME: data containing quote-like characters (like ¨) break pandas parsing as well
    csv_data = csv_data.replace(u'¨', '')
    # Collapse doubled quotes and strip stray mid-field quotes that
    # would otherwise derail the CSV parser.
    csv_data = re.sub(r'([^;\n])""+([^;\n])', r'\1"\2', csv_data)
    csv_data = re.sub(r'([^;\n])"([^;\n])', r'\1\2', csv_data)
    data = StringIO(csv_data)
    if data:
        df = pd.read_csv(data, skiprows=1, delimiter=";", parse_dates=[7],
                         decimal=',', error_bad_lines=False,
                         encoding='utf-8').dropna(how='all')
        expected_header = [u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA',
                           u'CNPJ_CPF', u'FORNECEDOR', u'DOCUMENTO', u'DATA',
                           u'DETALHAMENTO', u'VALOR_REEMBOLSADO']
        actual_header = df.columns.values.tolist()
        if actual_header != expected_header:
            print u'Bad CSV: expected header %s, got %s' % (expected_header, actual_header)
            return
        archived_expense_list = []
        objects_counter = 0
        archived_expense_list_counter = len(df.index)
        for idx, row in df.iterrows():
            name = row["SENADOR"]
            nature = row["TIPO_DESPESA"]
            cpf_cnpj = self.normalize_cnpj_or_cpf(row["CNPJ_CPF"])
            supplier_name = row["FORNECEDOR"]
            docnumber = row["DOCUMENTO"]
            expense_date = row["DATA"]
            expensed = row['VALOR_REEMBOLSADO']
            # FIXME: WTF?
            if isinstance(expensed, unicode):
                expensed = float(expensed.replace(',', '.').replace('\r\n', ''))
            nature, _ = ExpenseNature.objects.get_or_create(name=nature)
            try:
                supplier = Supplier.objects.get(identifier=cpf_cnpj)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=cpf_cnpj, name=supplier_name)
                supplier.save()
            legislator, _ = self.try_name_disambiguation(name)
            if not legislator:
                legislator = Legislator.objects.get(name__iexact=name)
            mandate = self.mandate_for_legislator(legislator, None)
            expense = ArchivedExpense(number=docnumber, nature=nature,
                                      date=expense_date, expensed=expensed,
                                      mandate=mandate, supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug("New expense found: %s" % unicode(expense))
            objects_counter += 1
            archived_expense_list_counter -= 1
            # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER.
            # If that lists is equal to the maximum object count allowed
            # or if there are no more objects in archived_expense_list,
            # we bulk_create() them and clear the list.
            if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                objects_counter = 0
                reset_queries()
    else:
        self.debug("Error downloading file for year %d" % year)
def update_data_for_year(self, year=datetime.now().year): self.debug("Updating data for year %d" % year) data = StringIO(self.retrieve_data_for_year(year)) if data: df = pd.read_csv(data, skiprows=1, delimiter=";", parse_dates=[7], decimal=',', error_bad_lines=False).dropna(how='all') expected_header = [ u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA', u'CNPJ_CPF', u'FORNECEDOR', u'DOCUMENTO', u'DATA', u'DETALHAMENTO', u'VALOR_REEMBOLSADO' ] actual_header = df.columns.values.tolist() if actual_header != expected_header: print u'Bad CSV: expected header %s, got %s' % ( expected_header, actual_header) return archived_expense_list = [] objects_counter = 0 archived_expense_list_counter = len(df.index) for idx, row in df.iterrows(): name = row["SENADOR"] nature = row["TIPO_DESPESA"] cpf_cnpj = row["CNPJ_CPF"].replace('.', '').replace( '-', '').replace('/', '') supplier_name = row["FORNECEDOR"] docnumber = row["DOCUMENTO"] expense_date = row["DATA"] expensed = row['VALOR_REEMBOLSADO'] nature, _ = ExpenseNature.objects.get_or_create(name=nature) try: supplier = Supplier.objects.get(identifier=cpf_cnpj) except Supplier.DoesNotExist: supplier = Supplier(identifier=cpf_cnpj, name=supplier_name) supplier.save() try: legislator = Legislator.objects.get(name__iexact=name) mandate = self.mandate_for_legislator(legislator, None) expense = ArchivedExpense( number=docnumber, nature=nature, date=expense_date, expensed=expensed, mandate=mandate, supplier=supplier, collection_run=self.collection_run) archived_expense_list.append(expense) self.debug("New expense found: %s" % unicode(expense)) objects_counter += 1 archived_expense_list_counter -= 1 # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER. # If that lists is equal to the maximum object count allowed # or if there are no more objects in archived_expense_list, # we bulk_create() them and clear the list. 
if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0: ArchivedExpense.objects.bulk_create( archived_expense_list) archived_expense_list[:] = [] objects_counter = 0 reset_queries() except Exception: pass else: self.debug("Error downloading file for year %d" % year)
def update_data(self):
    """Download the Chamber of Deputies expense XML dumps and import them.

    A 'cdep-collection-run' marker file makes interrupted runs
    restartable: a stale run found on disk is deleted before a fresh
    collection run is created, and the marker is removed on success.
    """
    if os.path.exists('cdep-collection-run'):
        # A previous run died mid-import; drop its partial data.
        crid = int(open('cdep-collection-run').read())
        CollectionRun.objects.get(id=crid).delete()
        os.unlink('cdep-collection-run')
    self.collection_run = self.create_collection_run(self.legislature)
    data_path = os.path.join(os.getcwd(), 'data', 'cdep')

    # AnoAtual covers the current year; the other archives are only
    # needed when the legislature started in earlier years.
    files_to_download = ['AnoAtual.zip']
    previous_years = date.today().year - self.legislature.date_start.year
    if previous_years:
        files_to_download.append('AnoAnterior.zip')
    if previous_years > 1:
        files_to_download.append('AnosAnteriores.zip')

    files_to_process = list()
    for file_name in files_to_download:
        xml_file_name = file_name.replace('zip', 'xml')
        full_xml_path = os.path.join(data_path, xml_file_name)
        files_to_process.append(os.path.join(data_path, full_xml_path))
        full_path = os.path.join(data_path, file_name)
        headers = dict()
        if os.path.exists(full_path):
            # Conditional GET: skip the download when unchanged upstream.
            headers['If-Modified-Since'] = http_date(os.path.getmtime(full_path),
                                                     usegmt=True)
        uri = 'http://www.camara.gov.br/cotas/' + file_name
        self.debug(u"Preparing to download %s…" % (uri))
        r = requests.get(uri, headers=headers, stream=True)
        if r.status_code == requests.codes.not_modified:
            self.debug(u"File %s not updated since last download, skipping…" % file_name)
            continue
        with open(full_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        self.debug(u"Unzipping %s…" % (file_name))
        zf = ZipFile(full_path, 'r')
        zf.extract(xml_file_name, data_path)

    # Record the in-progress run so a crash can be cleaned up next time.
    open('cdep-collection-run', 'w').write('%d' % (self.collection_run.id))

    archived_expense_list = []
    for file_name in reversed(files_to_process):
        self.debug(u"Processing %s…" % file_name)
        objects_counter = 0
        # Stream-parse the (large) XML to keep memory usage bounded.
        context = iterparse(file_name, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)
        for event, elem in context:
            if event != "end" or elem.tag != "DESPESA":
                continue

            # Some entries lack numLegislatura, so we fallback to numAno.
            legislature_year = elem.find('nuLegislatura')
            if legislature_year is not None:
                legislature_year = int(legislature_year.text)
            else:
                legislature_year = int(elem.find('numAno').text)
                if legislature_year < self.legislature.date_start.year or \
                        legislature_year > self.legislature.date_end.year:
                    legislature_year = None
                else:
                    legislature_year = self.legislature.date_start.year
            if legislature_year != self.legislature.date_start.year:
                self.debug(u"Ignoring entry because it's out of the target legislature…")
                continue

            name = elem.find('txNomeParlamentar').text.title()
            nature = elem.find('txtDescricao').text.title()

            supplier_name = elem.find('txtBeneficiario')
            if supplier_name is not None:
                supplier_name = supplier_name.text.title()
            else:
                supplier_name = u'Sem nome'

            # NOTE(review): if the txtCNPJCPF element exists but its
            # .text is None, normalize_cnpj_or_cpf() receives None --
            # verify against the sibling importer, which guards on
            # `supplier_identifier.text is not None`.
            supplier_identifier = elem.find('txtCNPJCPF')
            if supplier_identifier is not None:
                supplier_identifier = self.normalize_cnpj_or_cpf(supplier_identifier.text)
            if not supplier_identifier:
                # No usable CNPJ/CPF: use a name-based placeholder.
                supplier_identifier = u'Sem CNPJ/CPF (%s)' % supplier_name
            try:
                supplier = Supplier.objects.get(identifier=supplier_identifier)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=supplier_identifier,
                                    name=supplier_name)
                supplier.save()

            docnumber = elem.find('txtNumero').text

            expense_date = elem.find('datEmissao')
            if expense_date is not None:
                # ISO date prefix (YYYY-MM-DD) -> date object.
                expense_date = date(*((int(x.lstrip('0')) for x in expense_date.text[:10].split('-'))))
            else:
                # No issue date: use the first day of the reference month.
                expense_year = int(elem.find('numAno').text)
                expense_month = int(elem.find('numMes').text)
                expense_date = date(expense_year, expense_month, 1)

            expensed = float(elem.find('vlrLiquido').text)
            nature, _ = ExpenseNature.objects.get_or_create(name=nature)

            party = party_name = elem.find('sgPartido')
            if party_name is not None:
                party_name = self._normalize_party_name(party_name.text)
                party, _ = PoliticalParty.objects.get_or_create(siglum=party_name)

            state = elem.find('sgUF').text
            original_id = elem.find('ideCadastro').text
            try:
                legislator = Legislator.objects.get(name__iexact=name)
            except Legislator.DoesNotExist:
                # Some legislators are not listed in the other WS because
                # they are not in exercise.
                self.debug(u"Found legislator who's not in exercise: %s" % name)
                legislator = Legislator(name=name)
                legislator.save()
            mandate = self.mandate_for_legislator(legislator, party,
                                                  state=state,
                                                  original_id=original_id)

            expense = ArchivedExpense(number=docnumber,
                                      nature=nature,
                                      date=expense_date,
                                      expensed=expensed,
                                      mandate=mandate,
                                      supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug(u"New expense found: %s" % unicode(expense))

            # Persist in batches to bound memory and query-log growth.
            objects_counter += 1
            if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                objects_counter = 0
                reset_queries()

            # Free the parsed element and its preceding siblings so the
            # tree does not grow while streaming.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
            continue

    if archived_expense_list:
        ArchivedExpense.objects.bulk_create(archived_expense_list)
    os.unlink('cdep-collection-run')
def update_data_for_year(self, year=datetime.now().year): self.debug("Updating data for year %d" % year) data = StringIO(self.retrieve_data_for_year(year)) if data: df = pd.read_csv(data, skiprows=1, delimiter=";", parse_dates=[7], decimal=',', error_bad_lines=False).dropna(how='all') expected_header = [u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA', u'CNPJ_CPF', u'FORNECEDOR', u'DOCUMENTO', u'DATA', u'DETALHAMENTO', u'VALOR_REEMBOLSADO'] actual_header = df.columns.values.tolist() if actual_header != expected_header: print u'Bad CSV: expected header %s, got %s' % (expected_header, actual_header) return archived_expense_list = [] objects_counter = 0 archived_expense_list_counter = len(df.index) for idx, row in df.iterrows(): name = row["SENADOR"] nature = row["TIPO_DESPESA"] cpf_cnpj = row["CNPJ_CPF"].replace('.', '').replace('-', '').replace('/', '') supplier_name = row["FORNECEDOR"] docnumber = row["DOCUMENTO"] expense_date = row["DATA"] expensed = row['VALOR_REEMBOLSADO'] nature, _ = ExpenseNature.objects.get_or_create(name=nature) try: supplier = Supplier.objects.get(identifier=cpf_cnpj) except Supplier.DoesNotExist: supplier = Supplier(identifier=cpf_cnpj, name=supplier_name) supplier.save() try: legislator = Legislator.objects.get(name__iexact=name) mandate = self.mandate_for_legislator(legislator, None) expense = ArchivedExpense(number=docnumber, nature=nature, date=expense_date, expensed=expensed, mandate=mandate, supplier=supplier, collection_run=self.collection_run) archived_expense_list.append(expense) self.debug("New expense found: %s" % unicode(expense)) objects_counter += 1 archived_expense_list_counter -= 1 # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER. # If that lists is equal to the maximum object count allowed # or if there are no more objects in archived_expense_list, # we bulk_create() them and clear the list. 
if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0: ArchivedExpense.objects.bulk_create(archived_expense_list) archived_expense_list[:] = [] objects_counter = 0 reset_queries() except Exception: pass else: self.debug("Error downloading file for year %d" % year)
def update_data_for_year(self, year=datetime.now().year):
    """Import the Senate reimbursement CSV for *year* via pandas.

    NOTE(review): the default value is evaluated at import time, so a
    long-lived process will keep using the year the module was loaded.
    """
    self.debug("Updating data for year %d" % year)
    csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')
    # FIXME: data containing quote-like characters (like ¨) break pandas parsing as well
    csv_data = csv_data.replace(u'¨', '')
    # Collapse doubled quotes and strip stray mid-field quotes that
    # would otherwise derail the CSV parser.
    csv_data = re.sub(r'([^;\n])""+([^;\n])', r'\1"\2', csv_data)
    csv_data = re.sub(r'([^;\n])"([^;\n])', r'\1\2', csv_data)
    data = StringIO(csv_data)
    if data:
        df = pd.read_csv(data, skiprows=1, delimiter=";", parse_dates=[7],
                         decimal=',', error_bad_lines=False,
                         encoding='utf-8').dropna(how='all')
        expected_header = [
            u'ANO', u'MES', u'SENADOR', u'TIPO_DESPESA', u'CNPJ_CPF',
            u'FORNECEDOR', u'DOCUMENTO', u'DATA', u'DETALHAMENTO',
            u'VALOR_REEMBOLSADO'
        ]
        actual_header = df.columns.values.tolist()
        if actual_header != expected_header:
            print u'Bad CSV: expected header %s, got %s' % (
                expected_header, actual_header)
            return
        archived_expense_list = []
        objects_counter = 0
        archived_expense_list_counter = len(df.index)
        for idx, row in df.iterrows():
            name = row["SENADOR"]
            nature = row["TIPO_DESPESA"]
            cpf_cnpj = self.normalize_cnpj_or_cpf(row["CNPJ_CPF"])
            supplier_name = row["FORNECEDOR"]
            docnumber = row["DOCUMENTO"]
            expense_date = row["DATA"]
            expensed = row['VALOR_REEMBOLSADO']
            # FIXME: WTF?
            if isinstance(expensed, unicode):
                expensed = float(
                    expensed.replace(',', '.').replace('\r\n', ''))
            nature, _ = ExpenseNature.objects.get_or_create(name=nature)
            try:
                supplier = Supplier.objects.get(identifier=cpf_cnpj)
            except Supplier.DoesNotExist:
                supplier = Supplier(identifier=cpf_cnpj, name=supplier_name)
                supplier.save()
            legislator, _ = self.try_name_disambiguation(name)
            if not legislator:
                legislator = Legislator.objects.get(name__iexact=name)
            mandate = self.mandate_for_legislator(legislator, None)
            expense = ArchivedExpense(number=docnumber, nature=nature,
                                      date=expense_date, expensed=expensed,
                                      mandate=mandate, supplier=supplier,
                                      collection_run=self.collection_run)
            archived_expense_list.append(expense)
            self.debug("New expense found: %s" % unicode(expense))
            objects_counter += 1
            archived_expense_list_counter -= 1
            # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER.
            # If that lists is equal to the maximum object count allowed
            # or if there are no more objects in archived_expense_list,
            # we bulk_create() them and clear the list.
            if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                objects_counter = 0
                reset_queries()
    else:
        self.debug("Error downloading file for year %d" % year)
def update_data_for_year(self, year): self.debug(u'Updating data for year {0}'.format(year)) try: csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n') except Exception: print u'Not found data for year {0}'.format(year) return # Skip first line head, tail = csv_data.split('\n', 1) self.debug(u'Reading file...') data = rows.import_from_csv(BytesIO(tail.encode('utf-8'))) if not data: self.debug(u'Error downloading file for year {0}'.format(year)) return expected_header = [ u'ano', u'mes', u'senador', u'tipo_despesa', u'cnpj_cpf', u'fornecedor', u'documento', u'data', u'detalhamento', u'valor_reembolsado', ] actual_header = data.fields.keys() if actual_header != expected_header: # FIXME print u'Bad CSV: expected header {0}, got {1}'.format( expected_header, actual_header) return archived_expense_list = [] objects_counter = 0 archived_expense_list_counter = len(data) legislators = {} mandates = {} natures = {} for row in data: if not row.senador: self.debug(u'Error downloading file for year {0}') continue if not row.data: date = '01/{0}/{1}'.format(row.mes, row.ano) expense_date = datetime.strptime(date, '%d/%m/%Y') else: expense_date = datetime.strptime(row.data, '%d/%m/%Y') name = self._normalize_name(row.senador) nature = row.tipo_despesa cpf_cnpj = self.normalize_cnpj_or_cpf(row.cnpj_cpf) supplier_name = row.fornecedor docnumber = row.documento expensed = row.valor_reembolsado # FIXME: WTF? 
if isinstance(expensed, unicode): expensed = float( expensed.replace(',', '.').replace('\r', '').replace('\n', '')) # memory cache expense_nature = natures.get(nature) if not expense_nature: expense_nature, _ = ExpenseNature.objects.get_or_create( name=nature) natures[nature] = expense_nature try: supplier = Supplier.objects.get(identifier=cpf_cnpj) except Supplier.DoesNotExist: supplier = Supplier(identifier=cpf_cnpj, name=supplier_name) supplier.save() self.debug(u'New supplier found: {0}'.format( unicode(supplier))) # memory cache legislator = legislators.get(name) if not legislator: legislator = self._get_or_create_legislator(name) legislators[name] = legislator # memory cache mandate = mandates.get(name) if not mandate: mandate = self.mandate_for_legislator(legislator, None) mandates[name] = mandate expense = ArchivedExpense(number=docnumber, nature=expense_nature, date=expense_date, expensed=expensed, mandate=mandate, supplier=supplier, collection_run=self.collection_run) archived_expense_list.append(expense) self.debug(u'New expense found: {0}'.format(unicode(expense))) objects_counter += 1 archived_expense_list_counter -= 1 # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER. # If that lists is equal to the maximum object count allowed # or if there are no more objects in archived_expense_list, # we bulk_create() them and clear the list. if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0: ArchivedExpense.objects.bulk_create(archived_expense_list) archived_expense_list[:] = [] objects_counter = 0 reset_queries()