def define_sheet_name(self, name):
    """Resolve *name* to the sheet name actually present in this file.

    Tries, in order: an exact match, a slug-based match, and finally a
    positional fallback (the sheet at the same index *name* occupies in
    SHEET_INFO). Returns None when no candidate sheet exists; raises
    ValueError for names not declared in SHEET_INFO.
    """
    if name not in SHEET_INFO:
        raise ValueError(f"Invalid sheet name {repr(name)}.")

    if name in self.sheet_names:
        return name

    # Second attempt: compare slugged versions of the names.
    wanted = slug(name)
    for sheet_name in self.sheet_names:
        if slug(sheet_name) == wanted:
            logging.info(
                f"Using {repr(sheet_name)} instead of {repr(name)} on {self.relative_filename}"
            )
            return sheet_name

    # Last resort: positional lookup. `name` is guaranteed to be in
    # SHEET_INFO by the guard above, so only the indexing can fail.
    position = list(SHEET_INFO.keys()).index(name)
    try:
        return self.sheet_names[position]
    except IndexError:
        logging.error(
            f"Sheet {repr(name)} not found on {self.relative_filename}")
        return None
Beispiel #2
0
    def __setitem__(self, key, value):
        """Assign a row (integer key) or an entire column (string key).

        An int key replaces the row at that position with a freshly-made
        row. A str key sets the column named ``slug(key)``: *value* must
        be an iterable with exactly one item per existing row; a new
        field is appended to every row, an existing one is overwritten
        in place. Raises ValueError on a length mismatch or on any other
        key type.
        """
        key_type = type(key)
        if key_type == int:
            self._rows[key] = self._make_row(value)
        elif key_type == str:  # fixed: `unicode` does not exist on Python 3
            values = list(value)  # materialize so the length can be checked
            if len(values) != len(self):
                raise ValueError('Values length ({}) should be the same as '
                                 'Table length ({})'
                                 .format(len(values), len(self)))

            from rows.fields import detect_types
            from rows.utils import slug

            field_name = slug(key)
            is_new_field = field_name not in self.field_names
            # Detect the column's field type from the provided values.
            field_type = detect_types([field_name],
                    [[value] for value in values])[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple('Row', self.field_names)

            if is_new_field:
                # Append the deserialized value to every existing row.
                for row, value in zip(self._rows, values):
                    row.append(field_type.deserialize(value))
            else:
                # Overwrite the existing column in place.
                field_index = self.field_names.index(field_name)
                for row, value in zip(self._rows, values):
                    row[field_index] = field_type.deserialize(value)
        else:
            raise ValueError('Unsupported key type: {}'
                    .format(type(key).__name__))
def extract_servidor(filename):
    """Yield normalized benefit records from a 'servidor' XLSX file.

    Streams the active sheet, skips the "Emitido..." banner rows, starts
    at the "CPF do beneficiado" header row, and yields one dict per data
    row with values/dates deserialized and fully-masked CPFs set to None.
    """
    def load_data(filename):
        # read_only + data_only streams cell values without formulas.
        wb = openpyxl.load_workbook(filename, data_only=True, read_only=True)
        sheet = wb.active
        started = False
        for row in tqdm(sheet.rows, desc=filename):
            row = [str(cell.value or "") for cell in row[:8]]
            if row[0].lower().startswith("emitido"):
                continue
            # Simplified from `started or (not started and ...)` — the
            # `not started` term was redundant.
            if started or row[0] == "CPF do beneficiado":
                started = True
                yield row

    data = load_data(filename)
    # The first yielded row is the header; slug it for dict keys.
    header = [slug(field_name) for field_name in next(data)]
    for row in data:
        row = dict(zip(header, row))
        row["valor"] = ISOOrBrazilianDecimalField.deserialize(row["valor"])
        row["data_inicio_da_percepcao"] = ISOOrBrazilianDateField.deserialize(
            row["data_inicio_da_percepcao"])
        row["data_de_publicacao_da_portaria"] = ISOOrBrazilianDateField.deserialize(
            row["data_de_publicacao_da_portaria"])
        row["mes_de_referencia"] = MonthDateField.deserialize(
            row["mes_de_referencia"])
        # A CPF consisting only of "*" is fully masked — treat as absent.
        if set(row["cpf_do_beneficiado"]) == {"*"}:
            row["cpf_do_beneficiado"] = None
        yield row
def migrate_usernames(filepath):
    """Migrate users with invalid usernames, logging changes to a CSV.

    For each user whose username fails `is_valid_username`, tries the
    candidates from `possible_usernames` (slugged) until one is free,
    saves the user with it and records the change in *filepath*. Prints
    an error only when no candidate is available.
    """
    with open(filepath, mode="w") as fobj:
        writer = csv.DictWriter(
            fobj, fieldnames=["old_username", "new_username", "email"])
        writer.writeheader()
        for user in User.objects.all():
            if is_valid_username(user.username):
                continue

            # Define possible usernames based on current and remove any
            # non-allowed chars
            possible = [
                slug(username, permitted_chars=possible_chars)
                for username in possible_usernames(user.username, user.email)
            ]
            for username in possible:
                if not User.objects.filter(username=username).exists():
                    writer.writerow({
                        "old_username": user.username,
                        "new_username": username,
                        "email": user.email
                    })
                    user.username = username
                    user.save()
                    break
            else:
                # BUG FIX: this message used to print for EVERY user,
                # even after a successful migration; for/else runs it
                # only when no candidate username was free.
                print(
                    f"ERROR: could not migrate {user} (tried: {', '.join(possible)})"
                )
Beispiel #5
0
    def __setitem__(self, key, value):
        """Assign a row (integer key) or an entire column (string key).

        An int key replaces the row at that position. A str key sets the
        column named ``slug(key)``: *value* must be an iterable with one
        item per existing row; a new field is appended, an existing one
        is overwritten in place. Raises ValueError on a length mismatch
        or on any other key type.
        """
        key_type = type(key)
        if key_type == int:
            self._rows[key] = self._make_row(value)
        elif key_type == str:  # fixed: `unicode` does not exist on Python 3
            values = list(value)  # materialize so the length can be checked
            if len(values) != len(self):
                raise ValueError('Values length ({}) should be the same as '
                                 'Table length ({})'.format(
                                     len(values), len(self)))

            from rows.fields import detect_types
            from rows.utils import slug

            field_name = slug(key)
            is_new_field = field_name not in self.field_names
            # Detect the column's field type from the provided values.
            field_type = detect_types([field_name],
                                      [[value]
                                       for value in values])[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple('Row', self.field_names)

            if is_new_field:
                # Append the deserialized value to every existing row.
                for row, value in zip(self._rows, values):
                    row.append(field_type.deserialize(value))
            else:
                # Overwrite the existing column in place.
                field_index = self.field_names.index(field_name)
                for row, value in zip(self._rows, values):
                    row[field_index] = field_type.deserialize(value)
        else:
            raise ValueError('Unsupported key type: {}'.format(
                type(key).__name__))
Beispiel #6
0
 def classify(self, name):
     """Return the cached classification for *name*, or None.

     The lookup key is the first slug token of *name*, uppercased.
     Requires the cache to have been populated via .load().
     """
     if not self.cache:
         raise RuntimeError(
             "Classification cache not loaded (must call .load())")
     key = slug(name).split("_")[0].upper()
     value = self.cache.get(key, None)
     # Empty-string entries count as "no classification".
     return None if value in (None, "") else value
    def make_month_request(self, year, month, force_url=None):
        """Build the scrapy Request for a (year, month) page.

        When *force_url* is given it is used verbatim; otherwise the URL
        is derived from self.month_url with the slugged month name.
        """
        if force_url is not None:
            url = force_url
        else:
            url = self.month_url.format(
                month_slug=slug(utils.MONTHS[month - 1]), year=year
            )
        meta = {"year": year, "month": month}
        return scrapy.Request(url=url, meta=meta, callback=self.parse_month)
    def general_metadata(self):
        """Get court, reference and publication month from Contracheque sheet.

        Returns a dict with exactly the keys "tribunal",
        "mes_ano_de_referencia" and "data_de_publicacao"; any other key
        found in the sheet is dropped with a warning.
        """

        # First, build the dict from the rows above the header row (the
        # one containing "CPF"/"Nome").
        meta = {}
        end_row = None
        for index, row in enumerate(self.sheet_rows("Contracheque")):
            if "CPF" in row or "Nome" in row:
                end_row = index - 1
                break
        if end_row is None:
            # BUG FIX: previously `end_row` stayed undefined here and
            # crashed with a NameError below; fail with a clear message.
            raise ValueError(
                f"Header row not found on Contracheque sheet of {self.relative_filename}"
            )
        sheet_name = self.define_sheet_name("Contracheque")
        table = self.read_data(sheet_name=sheet_name, end_row=end_row)
        for row in table:
            values = list(row._asdict().values())
            if "-" not in (values[0] or ""):
                # Keep unique non-empty values, preserving order; first
                # becomes the (slugged) key, second the value.
                non_empty_values = []
                for value in values:
                    if value and value not in non_empty_values:
                        non_empty_values.append(value)
                if non_empty_values and len(non_empty_values) >= 2:
                    meta[slug(non_empty_values[0])] = non_empty_values[1]

        # Check, convert and rename if needed

        # Court name
        court_from_metadata = utils.fix_tribunal(
            self.file_metadata["tribunal"])
        court = utils.fix_tribunal(meta.pop("orgao", ""))
        if utils.is_court_name_equivalent(court_from_metadata, court):
            court = court_from_metadata
        else:
            # TODO: may not proceed
            logging.warning(
                f"orgao from metadata ({repr(court or None)}) different from file metadata ({repr(court_from_metadata)}) on {self.relative_filename}"
            )
        # Using same court named from download page to maintain consistency
        # (court names from there are more correct in general and will make
        # joins between tables easily).
        meta["tribunal"] = court

        # Reference month: accept ISO dates, "<month> de <year>" and
        # "MM/YYYY"; fall back to the file metadata otherwise.
        reference_month = str(meta.pop("mesano_de_referencia", None) or "")
        reference_from_metadata = (
            f"{self.file_metadata['ano']}-{self.file_metadata['mes']:02d}-01")
        if not reference_month:
            meta["mes_ano_de_referencia"] = reference_from_metadata
        elif regexp_date.match(reference_month):
            parts = reference_month.split("-")
            meta["mes_ano_de_referencia"] = f"{parts[0]}-{parts[1]}-01"
        elif " de " in reference_month.lower():
            month, year = reference_month.lower().split(" de ")
            if len(year) == 2:
                year = f"20{year}"
            month = utils.MONTHS.index(slug(month)) + 1
            meta["mes_ano_de_referencia"] = f"{year}-{month:02d}-01"
        elif "/" in reference_month:
            month, year = reference_month.split("/")
            if len(year) == 2:
                year = f"20{year}"
            meta["mes_ano_de_referencia"] = f"{year}-{int(month):02d}-01"
        else:
            logging.error(
                f"Can't parse mes_ano_de_referencia ({repr(reference_month)}) in {self.relative_filename}"
            )
            meta["mes_ano_de_referencia"] = reference_from_metadata
        if meta["mes_ano_de_referencia"] != reference_from_metadata:
            logging.warning(
                f"mes_ano_de_referencia ({repr(reference_month)}) different from file metadata ({repr(reference_from_metadata)})"
            )

        # Publication date: normalize "DD/MM/YYYY" to ISO; otherwise keep
        # the date part of an already-ISO value, or give up with None.
        publication_date = meta.get("data_de_publicacao", None)
        if isinstance(publication_date, str):
            if not regexp_date.match(publication_date):
                if publication_date.count("/") == 2:
                    day, month, year = publication_date.split("/")
                    if len(year) == 2:
                        year = f"20{year}"
                    meta[
                        "data_de_publicacao"] = f"{year}-{int(month):02d}-{int(day):02d}"
                else:
                    logging.error(
                        f"Can't parse data_de_publicacao ({repr(publication_date)}) from {self.relative_filename}"
                    )
            else:
                meta["data_de_publicacao"] = publication_date.split()[0]
        else:
            if publication_date is not None:
                logging.error(
                    f"Can't parse data_de_publicacao ({repr(publication_date)}) and it's not None from {self.relative_filename}"
                )
            meta["data_de_publicacao"] = None
        if "T" in str(meta["data_de_publicacao"] or ""):
            # Got as datetime
            meta["data_de_publicacao"] = meta["data_de_publicacao"].split(
                "T")[0]
        elif str(meta["data_de_publicacao"] or "").isdigit():
            # Filled incorrectly
            meta["data_de_publicacao"] = None

        # Drop anything that is not one of the three expected keys.
        for key in list(meta.keys()):
            if key not in ("data_de_publicacao", "tribunal",
                           "mes_ano_de_referencia"):
                del meta[key]
                logging.warning(
                    f"Ignoring invalid general_metadata key {repr(key)} from {self.relative_filename}"
                )

        return meta
def fix_header(sheet_name, header):
    """Normalize a sheet's raw header into the canonical schema field names.

    Applies ordered substring replacements, strips sheet-name prefixes and
    suffixes, applies exact-name renames, then validates the result against
    SHEET_INFO[sheet_name]["schema"], tolerating exactly one renamed field
    (after discarding missing optional fields). Raises ValueError when the
    header cannot be reconciled with the reference schema.
    """
    name = sheet_name
    # BUG FIX: the guard must match the separator used by split() below —
    # checking for a bare "-" made names containing "-" without spaced
    # dashes raise IndexError on the [1] access.
    if " - " in name:
        name = name.split(" - ")[1]
    sheet_slug = slug(name)

    # Ordered substring replacements (order matters: e.g. "outra_1_dirpes"
    # must be handled before the generic "outra_1", and
    # "vantagens_eventuavs_" must be fixed before "vantagens_eventuais" is
    # remapped).
    replacements = (
        ("vant_art_", "vantagens_artigo_"),
        ("vantagens_art_", "vantagens_artigo_"),
        ("outra_1_dirpes", "outra"),
        ("outra_2_dirpes", "outra_2"),
        ("outra_1_direvent", "outra"),
        ("outra_2_direvent", "outra_2"),
        ("outra_1", "outra"),
        ("gratificacao_presidencia", "gratificacao_de_presidencia"),
        ("vantagens_eventuavs_", "vantagens_eventuais_"),
        ("auxilioalimentacao", "auxilio_alimentacao"),
        ("auxilio_preescolar", "auxilio_pre_escolar"),
        ("correcao_monetariajuros", "correcao_monetaria_juros"),
        ("vantagens_eventuais", "direitos_eventuais"),
        ("vantagens_pessoais", "direitos_pessoais"),
    )
    # Exact-name renames, applied after prefix/suffix stripping.
    renames = {
        "total_de": "total",
        "total_de_": "total",
        "cargo_origem": "cargo_de_origem",
        "outra_detalhe": "detalhe",
        "outra_pae": "parcela_autonoma_de_equivalencia",
        "previdencia_publica": "descontos_previdencia_publica",
        "vantagens_artigo_184_e_192_lei_171152":
            "vantagens_artigo_184_i_e_192_i_lei_171152",
        "abono_constitucional_de_1_3_de_ferias":
            "abono_constitucional_de_13_de_ferias",
        "gratificacao_por_encargo_cursoconcurso":
            "gratificacao_por_encargo_curso_concurso",
    }

    new_header = []
    for value in header:
        field_name = slug(regexp_parenthesis.sub("", value or "").strip())
        # Stop at placeholder/observation columns — everything after them
        # is not data.
        if (value is None or "deverao_ser_preenchidos" in field_name
                or "observacao" in field_name
                or field_name in ("sjsp", "trf", "sjms")):
            break

        for old, new in replacements:
            field_name = field_name.replace(old, new)

        # Strip the sheet name (or known prefixes) from the field name.
        if field_name.startswith(sheet_slug):
            field_name = field_name[len(sheet_slug):]
        elif field_name.startswith("vantagens_eventuais_"):
            field_name = field_name[len("vantagens_eventuais_"):]
        elif field_name in (
                "subsidio_total_de",
                "subsidio_outra",
                "subsidio_outra_detalhe",
        ):
            field_name = field_name.replace("subsidio_", "")
        field_name = slug(field_name)

        # Strip known suffixes as well.
        if field_name.endswith(sheet_slug):
            field_name = field_name[:-(len(sheet_slug))]
        elif field_name.endswith("_vantagens_pessoais"):
            field_name = field_name[:-(len("_vantagens_pessoais"))]
        field_name = slug(field_name)

        field_name = renames.get(field_name, field_name)
        new_header.append(field_name)
    header = make_header(new_header)

    # Validate against the reference schema; missing optional fields are
    # fine, and a single symmetric difference is treated as a rename.
    schema = SHEET_INFO[sheet_name]["schema"]
    reference_header = list(schema.keys())
    diff1 = set(reference_header) - set(header)
    diff2 = set(header) - set(reference_header)

    for field_name, field_type in schema.items():
        if field_name in diff1 and field_type.optional:
            diff1.remove(field_name)
    if diff1 or diff2:
        if len(diff1) > 1 or len(diff2) > 1 or len(diff1) != len(diff2):
            raise ValueError(
                f"Invalid header: {header} (expected: {reference_header}). A - B: {diff2}, B - A: {diff1}"
            )
        header[header.index(diff2.pop())] = diff1.pop()

    return header
Beispiel #10
0
 def test_slug(self):
     """slug() should transliterate, lowercase and collapse separators."""
     cases = (
         ('Álvaro Justen', 'alvaro_justen'),
         ("Moe's Bar", 'moes_bar'),
         ("-----te-----st------", 'te_st'),
     )
     for original, expected in cases:
         self.assertEqual(slug(original), expected)
Beispiel #11
0
 def test_slug(self):
     """slug() should transliterate, lowercase and collapse separators."""
     for original, expected in (
             ('Álvaro Justen', 'alvaro_justen'),
             ("Moe's Bar", 'moes_bar'),
             ("-----te-----st------", 'te_st')):
         self.assertEqual(slug(original), expected)