def define_sheet_name(self, name):
    """Fix a sheet name: resolve `name` to the sheet actually present.

    Tries, in order: an exact match, a slug-insensitive match (covers
    casing/accent/spacing variations), and finally positional lookup
    assuming sheets appear in the same order as SHEET_INFO. Returns the
    resolved sheet name, or None (with an error logged) when nothing fits.
    Raises ValueError for names that are not known sheet names at all.
    """
    if name not in SHEET_INFO:
        raise ValueError(f"Invalid sheet name {repr(name)}.")
    if name in self.sheet_names:
        return name

    # Second chance: compare slugged names and use the matching sheet.
    wanted = slug(name)
    candidates = [found for found in self.sheet_names if slug(found) == wanted]
    if candidates:
        found = candidates[0]
        logging.info(
            f"Using {repr(found)} instead of {repr(name)} on {self.relative_filename}"
        )
        return found

    # Last resort: pick the sheet at the same position `name` occupies in
    # SHEET_INFO (name is guaranteed to be a key — checked above).
    position = list(SHEET_INFO.keys()).index(name)
    try:
        return self.sheet_names[position]
    except IndexError:
        logging.error(
            f"Sheet {repr(name)} not found on {self.relative_filename}")
        return None
def __setitem__(self, key, value):
    """Set a row (int key) or a whole column (str key).

    - ``table[i] = row_data`` replaces row ``i``.
    - ``table['field'] = values`` creates or replaces a column: the values
      are type-detected, the field is registered and each cell is
      deserialized into the detected type.

    Raises:
        ValueError: if the column length differs from the table length,
            or the key type is unsupported.
    """
    key_type = type(key)
    if key_type == int:
        self._rows[key] = self._make_row(value)
    elif key_type == str:  # fixed: `unicode` is a NameError on Python 3
        values = list(value)  # I'm not lazy, sorry
        if len(values) != len(self):
            raise ValueError('Values length ({}) should be the same as '
                             'Table length ({})'
                             .format(len(values), len(self)))

        # Imported lazily to avoid a circular import at module load time.
        from rows.fields import detect_types
        from rows.utils import slug

        field_name = slug(key)
        is_new_field = field_name not in self.field_names
        field_type = detect_types(
            [field_name], [[value] for value in values])[field_name]
        self.fields[field_name] = field_type
        self.Row = namedtuple('Row', self.field_names)

        if is_new_field:
            for row, value in zip(self._rows, values):
                row.append(field_type.deserialize(value))
        else:
            field_index = self.field_names.index(field_name)
            for row, value in zip(self._rows, values):
                row[field_index] = field_type.deserialize(value)
    else:
        raise ValueError('Unsupported key type: {}'
                         .format(type(key).__name__))
def extract_servidor(filename):
    """Yield one dict per data row of the "servidor" spreadsheet `filename`.

    Streams the workbook, skips everything before the header row (the one
    whose first cell is "CPF do beneficiado"), then normalizes each row:
    money and date fields are deserialized and fully-masked CPFs become
    None.
    """

    def load_data(filename):
        # read_only + data_only: stream computed cell values instead of
        # loading the whole workbook into memory.
        wb = openpyxl.load_workbook(filename, data_only=True, read_only=True)
        sheet = wb.active
        started = False
        for row in tqdm(sheet.rows, desc=filename):
            row = [str(cell.value or "") for cell in row[:8]]
            if row[0].lower().startswith("emitido"):
                continue  # report banner, not data
            # `started or (not started and X)` simplified to `started or X`;
            # also dropped the unused `data = []` accumulator.
            if started or row[0] == "CPF do beneficiado":
                started = True
                yield row

    data = load_data(filename)
    header = [slug(field_name) for field_name in next(data)]
    for row in data:
        row = dict(zip(header, row))
        row["valor"] = ISOOrBrazilianDecimalField.deserialize(row["valor"])
        row["data_inicio_da_percepcao"] = ISOOrBrazilianDateField.deserialize(
            row["data_inicio_da_percepcao"])
        row["data_de_publicacao_da_portaria"] = ISOOrBrazilianDateField.deserialize(
            row["data_de_publicacao_da_portaria"])
        row["mes_de_referencia"] = MonthDateField.deserialize(
            row["mes_de_referencia"])
        # A CPF consisting only of "*" is fully redacted: store as missing.
        if set(row["cpf_do_beneficiado"]) == {"*"}:
            row["cpf_do_beneficiado"] = None
        yield row
def migrate_usernames(filepath):
    """Rename users with invalid usernames, logging old/new pairs to a CSV.

    For every user whose current username fails is_valid_username(), tries
    slugged candidate usernames (derived from the current username and the
    email) until an unused one is found, then renames the user and records
    the change in `filepath`. Prints an ERROR line for users that could
    not be migrated.
    """
    with open(filepath, mode="w") as fobj:
        writer = csv.DictWriter(
            fobj, fieldnames=["old_username", "new_username", "email"])
        writer.writeheader()
        for user in User.objects.all():
            if is_valid_username(user.username):
                continue
            # Define possible usernames based on current and remove any
            # non-allowed chars
            possible = [
                slug(username, permitted_chars=possible_chars)
                for username in possible_usernames(user.username, user.email)
            ]
            for username in possible:
                if not User.objects.filter(username=username).exists():
                    writer.writerow({
                        "old_username": user.username,
                        "new_username": username,
                        "email": user.email
                    })
                    user.username = username
                    user.save()
                    break
            else:
                # for/else: only report failure when NO candidate was free
                # (bug fix: the message used to print even after a
                # successful migration above).
                print(
                    f"ERROR: could not migrate {user} (tried: {', '.join(possible)})"
                )
def __setitem__(self, key, value):
    """Set a row (int key) or a whole column (str key).

    - ``table[i] = row_data`` replaces row ``i``.
    - ``table['field'] = values`` creates or replaces a column: the values
      are type-detected, the field is registered and each cell is
      deserialized into the detected type.

    Raises:
        ValueError: if the column length differs from the table length,
            or the key type is unsupported.
    """
    key_type = type(key)
    if key_type == int:
        self._rows[key] = self._make_row(value)
    elif key_type == str:  # fixed: `unicode` is a NameError on Python 3
        values = list(value)  # I'm not lazy, sorry
        if len(values) != len(self):
            raise ValueError('Values length ({}) should be the same as '
                             'Table length ({})'.format(
                                 len(values), len(self)))

        # Imported lazily to avoid a circular import at module load time.
        from rows.fields import detect_types
        from rows.utils import slug

        field_name = slug(key)
        is_new_field = field_name not in self.field_names
        field_type = detect_types(
            [field_name], [[value] for value in values])[field_name]
        self.fields[field_name] = field_type
        self.Row = namedtuple('Row', self.field_names)

        if is_new_field:
            for row, value in zip(self._rows, values):
                row.append(field_type.deserialize(value))
        else:
            field_index = self.field_names.index(field_name)
            for row, value in zip(self._rows, values):
                row[field_index] = field_type.deserialize(value)
    else:
        raise ValueError('Unsupported key type: {}'.format(
            type(key).__name__))
def classify(self, name):
    """Return the cached classification for `name`, or None when unknown.

    Raises RuntimeError if the cache has not been loaded via .load().
    """
    if not self.cache:
        raise RuntimeError(
            "Classification cache not loaded (must call .load())")
    # Normalize the lookup key: slug, keep only the first token, uppercase.
    key = slug(name).split("_")[0].upper()
    value = self.cache.get(key)
    # Treat empty-string cache entries the same as missing ones.
    return value if value not in (None, "") else None
def make_month_request(self, year, month, force_url=None):
    """Build the scrapy Request for a (year, month) listing page.

    `force_url` overrides the URL template for months published at a
    non-standard address; otherwise the URL is derived from the slugged
    month name and year. Year/month travel in the request meta so the
    callback can attribute the response.
    """
    if force_url is not None:
        url = force_url
    else:
        month_name = slug(utils.MONTHS[month - 1])
        url = self.month_url.format(month_slug=month_name, year=year)
    return scrapy.Request(
        url=url,
        meta={"year": year, "month": month},
        callback=self.parse_month,
    )
def general_metadata(self):
    """Get court, reference and publication month from Contracheque sheet.

    Returns a dict with exactly three keys: "tribunal",
    "mes_ano_de_referencia" (normalized to "YYYY-MM-01") and
    "data_de_publicacao" (ISO date string or None). Sheet values are
    cross-checked against self.file_metadata; on mismatch or parse
    failure the file metadata wins and a warning/error is logged.
    """
    # First, build the dict
    meta = {}
    # The metadata block ends right before the payroll table header (the
    # first row mentioning "CPF" or "Nome").
    # NOTE(review): if no such row exists, end_row is never bound and the
    # read_data() call below raises NameError — confirm this cannot happen.
    for index, row in enumerate(self.sheet_rows("Contracheque")):
        if "CPF" in row or "Nome" in row:
            end_row = index - 1
            break
    sheet_name = self.define_sheet_name("Contracheque")
    table = self.read_data(sheet_name=sheet_name, end_row=end_row)
    for row in table:
        values = list(row._asdict().values())
        # Skip separator-like rows (first cell containing "-")
        if "-" not in (values[0] or ""):
            # Collect unique non-empty cells, preserving order:
            # first one is the label, second is its value.
            non_empty_values = []
            for value in values:
                if value and value not in non_empty_values:
                    non_empty_values.append(value)
            if non_empty_values and len(non_empty_values) >= 2:
                meta[slug(non_empty_values[0])] = non_empty_values[1]

    # Check, convert and rename if needed
    # Court name
    court_from_metadata = utils.fix_tribunal(
        self.file_metadata["tribunal"])
    court = utils.fix_tribunal(meta.pop("orgao", ""))
    if utils.is_court_name_equivalent(court_from_metadata, court):
        court = court_from_metadata
    else:
        # TODO: may not proceed
        logging.warning(
            f"orgao from metadata ({repr(court or None)}) different from file metadata ({repr(court_from_metadata)}) on {self.relative_filename}"
        )
    # Using same court named from download page to maintain consistency
    # (court names from there are more correct in general and will make
    # joins between tables easily).
    meta["tribunal"] = court

    # Reference month: accept ISO ("YYYY-MM-..."), "<month> de <year>"
    # and "MM/YYYY" (2- or 4-digit year); otherwise fall back to the
    # year/month from the file metadata.
    reference_month = str(meta.pop("mesano_de_referencia", None) or "")
    reference_from_metadata = (
        f"{self.file_metadata['ano']}-{self.file_metadata['mes']:02d}-01")
    if not reference_month:
        meta["mes_ano_de_referencia"] = reference_from_metadata
    elif regexp_date.match(reference_month):
        parts = reference_month.split("-")
        meta["mes_ano_de_referencia"] = f"{parts[0]}-{parts[1]}-01"
    elif " de " in reference_month.lower():
        # e.g. "janeiro de 2019" (or "... de 19")
        month, year = reference_month.lower().split(" de ")
        if len(year) == 2:
            year = f"20{year}"
        month = utils.MONTHS.index(slug(month)) + 1
        meta["mes_ano_de_referencia"] = f"{year}-{month:02d}-01"
    elif "/" in reference_month:
        # e.g. "01/2019" (or "01/19")
        month, year = reference_month.split("/")
        if len(year) == 2:
            year = f"20{year}"
        meta["mes_ano_de_referencia"] = f"{year}-{int(month):02d}-01"
    else:
        logging.error(
            f"Can't parse mes_ano_de_referencia ({repr(reference_month)}) in {self.relative_filename}"
        )
        meta["mes_ano_de_referencia"] = reference_from_metadata
    if meta["mes_ano_de_referencia"] != reference_from_metadata:
        logging.warning(
            f"mes_ano_de_referencia ({repr(reference_month)}) different from file metadata ({repr(reference_from_metadata)})"
        )

    # Publication date: normalize to "YYYY-MM-DD" or None.
    publication_date = meta.get("data_de_publicacao", None)
    if isinstance(publication_date, str):
        if not regexp_date.match(publication_date):
            if publication_date.count("/") == 2:
                # Brazilian format "DD/MM/YYYY" (or 2-digit year)
                day, month, year = publication_date.split("/")
                if len(year) == 2:
                    year = f"20{year}"
                meta[
                    "data_de_publicacao"] = f"{year}-{int(month):02d}-{int(day):02d}"
            else:
                logging.error(
                    f"Can't parse data_de_publicacao ({repr(publication_date)}) from {self.relative_filename}"
                )
        else:
            # Already ISO-prefixed; keep only the date part
            meta["data_de_publicacao"] = publication_date.split()[0]
    else:
        if publication_date is not None:
            logging.error(
                f"Can't parse data_de_publicacao ({repr(publication_date)}) and it's not None from {self.relative_filename}"
            )
        meta["data_de_publicacao"] = None
    if "T" in str(meta["data_de_publicacao"] or ""):
        # Got as datetime
        meta["data_de_publicacao"] = meta["data_de_publicacao"].split(
            "T")[0]
    elif str(meta["data_de_publicacao"] or "").isdigit():
        # Filled incorrectly
        meta["data_de_publicacao"] = None

    # Keep only the three expected keys; warn about anything else.
    for key in list(meta.keys()):
        if key not in ("data_de_publicacao", "tribunal",
                       "mes_ano_de_referencia"):
            del meta[key]
            logging.warning(
                f"Ignoring invalid general_metadata key {repr(key)} from {self.relative_filename}"
            )
    return meta
def fix_header(sheet_name, header):
    """Normalize a sheet's raw header into canonical schema field names.

    Slugs each column title, applies a battery of exact renames and
    prefix/suffix strips to undo per-court naming variations, then
    validates the result against SHEET_INFO[sheet_name]["schema"].
    Exactly one unmatched column is assumed to be a rename and is mapped
    onto the single missing schema field; any larger mismatch raises
    ValueError.
    """
    name = sheet_name
    if "-" in name:
        # Keep only the part after " - " (e.g. "01 - Contracheque").
        # NOTE(review): the guard tests "-" but the split uses " - "; a
        # name containing a dash without spaces would raise IndexError.
        name = name.split(" - ")[1]
    sheet_slug = slug(name)
    new_header = []
    for value in header:
        # Drop parenthesized annotations before slugging.
        field_name = slug(regexp_parenthesis.sub("", value or "").strip())
        # Stop at trailing instruction/observation columns.
        if (value is None or "deverao_ser_preenchidos" in field_name
                or "observacao" in field_name
                or field_name in ("sjsp", "trf", "sjms")):
            break
        # Unify known abbreviation/typo variants into canonical names.
        field_name = (field_name.replace(
            "vant_art_", "vantagens_artigo_").replace(
                "vantagens_art_", "vantagens_artigo_").replace(
                    "outra_1_dirpes", "outra").replace(
                        "outra_2_dirpes", "outra_2").replace(
                            "outra_1_direvent", "outra").replace(
                                "outra_2_direvent", "outra_2").replace(
                                    "outra_1", "outra").replace(
            "gratificacao_presidencia", "gratificacao_de_presidencia").replace(
            "vantagens_eventuavs_", "vantagens_eventuais_").replace(
            "auxilioalimentacao", "auxilio_alimentacao").replace(
            "auxilio_preescolar", "auxilio_pre_escolar").replace(
            "correcao_monetariajuros", "correcao_monetaria_juros").replace(
            "vantagens_eventuais", "direitos_eventuais").replace(
            "vantagens_pessoais", "direitos_pessoais"))
        # Strip the sheet name (or known column-group prefixes) from the
        # front of the field name.
        if field_name.startswith(sheet_slug):
            field_name = field_name[len(sheet_slug):]
        elif field_name.startswith("vantagens_eventuais_"):
            field_name = field_name[len("vantagens_eventuais_"):]
        elif field_name in (
                "subsidio_total_de",
                "subsidio_outra",
                "subsidio_outra_detalhe",
        ):
            field_name = field_name.replace("subsidio_", "")
        field_name = slug(field_name)
        # Strip the sheet name (or known group suffixes) from the end.
        if field_name.endswith(sheet_slug):
            field_name = field_name[:-(len(sheet_slug))]
        elif field_name.endswith("_vantagens_pessoais"):
            field_name = field_name[:-(len("_vantagens_pessoais"))]
        field_name = slug(field_name)
        # Final one-off renames.
        if field_name in ("total_de", "total_de_"):
            field_name = "total"
        elif field_name == "cargo_origem":
            field_name = "cargo_de_origem"
        elif field_name == "outra_detalhe":
            field_name = "detalhe"
        elif field_name == "outra_pae":
            field_name = "parcela_autonoma_de_equivalencia"
        elif field_name == "previdencia_publica":
            field_name = "descontos_previdencia_publica"
        elif field_name == "vantagens_artigo_184_e_192_lei_171152":
            field_name = "vantagens_artigo_184_i_e_192_i_lei_171152"
        elif field_name == "abono_constitucional_de_1_3_de_ferias":
            field_name = "abono_constitucional_de_13_de_ferias"
        elif field_name == "gratificacao_por_encargo_cursoconcurso":
            field_name = "gratificacao_por_encargo_curso_concurso"
        new_header.append(field_name)
    header = make_header(new_header)
    # Validate against the expected schema; optional fields may be absent.
    schema = SHEET_INFO[sheet_name]["schema"]
    reference_header = list(schema.keys())
    diff1 = set(reference_header) - set(header)
    diff2 = set(header) - set(reference_header)
    for field_name, field_type in schema.items():
        if field_name in diff1 and field_type.optional:
            diff1.remove(field_name)
    if diff1 or diff2:
        if len(diff1) > 1 or len(diff2) > 1 or len(diff1) != len(diff2):
            raise ValueError(
                f"Invalid header: {header} (expected: {reference_header}). A - B: {diff2}, B - A: {diff1}"
            )
        # Exactly one mismatch on each side: assume it's a rename.
        header[header.index(diff2.pop())] = diff1.pop()
    return header
def test_slug(self):
    """slug() lowercases, strips accents/punctuation and collapses separators."""
    cases = [
        ('Álvaro Justen', 'alvaro_justen'),
        ("Moe's Bar", 'moes_bar'),
        ("-----te-----st------", 'te_st'),
    ]
    for text, expected in cases:
        self.assertEqual(slug(text), expected)