Example #1
def get_psql_copy_command(table_name,
                          header,
                          encoding,
                          user=None,
                          password=None,
                          host=None,
                          port=None,
                          database_name=None,
                          database_uri=None,
                          dialect=csv.excel):

    table_name = slug(table_name)
    header = ', '.join(slug(field_name) for field_name in header)
    copy = ("\copy {table_name} ({header}) FROM STDIN "
            "DELIMITER '{delimiter}' "
            "QUOTE '{quote}' "
            "ENCODING '{encoding}' "
            "CSV HEADER;").format(table_name=table_name,
                                  header=header,
                                  delimiter=dialect.delimiter.replace(
                                      "'", "\\'"),
                                  quote=dialect.quotechar.replace("'", "\\'"),
                                  encoding=encoding)

    return get_psql_command(copy,
                            user=user,
                            password=password,
                            host=host,
                            port=port,
                            database_name=database_name,
                            database_uri=database_uri)
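A hedged usage sketch for orientation (the connection URI is illustrative, and `get_psql_command` is assumed to wrap the meta-command into a full `psql` invocation):

import csv

command = get_psql_copy_command(
    table_name="My Table",        # slugged to my_table
    header=["Full Name", "Age"],  # slugged to full_name, age
    encoding="utf-8",
    database_uri="postgres://user:pass@localhost/mydb",  # illustrative
    dialect=csv.excel,
)
# The embedded meta-command should look roughly like:
# \copy my_table (full_name, age) FROM STDIN DELIMITER ',' QUOTE '"'
#     ENCODING 'utf-8' CSV HEADER;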
Example #2
    def test_slug(self):
        self.assertEqual(plugins_utils.slug('Álvaro Justen'), 'alvaro_justen')
        self.assertEqual(plugins_utils.slug("Moe's Bar"), 'moes_bar')
        self.assertEqual(plugins_utils.slug("-----te-----st------"), 'te_st')
        # As in <https://github.com/turicas/rows/issues/179>
        self.assertEqual(
            plugins_utils.slug('Query Occurrence"( % ),"First Seen'),
            'query_occurrence_first_seen')
        self.assertEqual(plugins_utils.slug(' ÁLVARO  justen% '),
                         'alvaro_justen')
Example #3
def get_psql_copy_command(
    table_name,
    header,
    encoding="utf-8",
    user=None,
    password=None,
    host=None,
    port=None,
    database_name=None,
    database_uri=None,
    dialect=csv.excel,
    direction="FROM",
):

    direction = direction.upper()
    if direction not in ("FROM", "TO"):
        raise ValueError('`direction` must be "FROM" or "TO"')

    table_name = slug(table_name)
    if header is None:
        header = ""
    else:
        header = ", ".join(slug(field_name) for field_name in header)
        header = "({header}) ".format(header=header)
    copy = (
        "\copy {table_name} {header}{direction} STDIN "
        "DELIMITER '{delimiter}' "
        "QUOTE '{quote}' "
        "ENCODING '{encoding}' "
        "CSV HEADER;"
    ).format(
        table_name=table_name,
        header=header,
        direction=direction,
        delimiter=dialect.delimiter.replace("'", "''"),
        quote=dialect.quotechar.replace("'", "''"),
        encoding=encoding,
    )

    return get_psql_command(
        copy,
        user=user,
        password=password,
        host=host,
        port=port,
        database_name=database_name,
        database_uri=database_uri,
    )
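A sketch of both directions under the same assumptions (URI illustrative); passing `header=None` omits the column list, which is the usual choice when exporting:

# Import: feed a local CSV into the table via the process' STDIN.
import_cmd = get_psql_copy_command(
    "events", ["name", "date"], direction="FROM",
    database_uri="postgres://localhost/demo",  # illustrative
)

# Export: header=None drops the "(col, ...)" list entirely.
export_cmd = get_psql_copy_command(
    "events", None, direction="TO",
    database_uri="postgres://localhost/demo",
)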
Example #4
    def __setitem__(self, key, value):
        key_type = type(key)
        if key_type == int:
            self._rows[key] = self._make_row(value)
        elif key_type is six.text_type:
            from rows import fields
            from rows.plugins import utils

            values = list(value)  # I'm not lazy, sorry
            if len(values) != len(self):
                raise ValueError(
                    "Values length ({}) should be the same as "
                    "Table length ({})".format(len(values), len(self))
                )

            field_name = utils.slug(key)
            is_new_field = field_name not in self.field_names
            field_type = fields.detect_types(
                [field_name], [[value] for value in values]
            )[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple("Row", self.field_names)

            if is_new_field:
                for row, value in zip(self._rows, values):
                    row.append(field_type.deserialize(value))
            else:
                field_index = self.field_names.index(field_name)
                for row, value in zip(self._rows, values):
                    row[field_index] = field_type.deserialize(value)
        else:
            raise ValueError("Unsupported key type: {}".format(type(key).__name__))
Example #5
    def parse_event(self, response):
        meta = response.request.meta

        data = meta["data"].copy()
        keys = [slug(key) for key in response.xpath("(//ul[@class='details-list'])[1]//li/text()").extract()]
        values = response.xpath("(//ul[@class='details-list'])[1]//li/span/text()").extract()
        data.update(dict(zip(keys, values)))
        ticket = response.xpath("(//a[contains(@href, 'ingresso.festivaldecuritiba.com.br')])[1]/@href").extract()
        data["ticket_url"] = ticket[0] if ticket else None
        image = response.xpath("//div[@class='about-image gallery-content']//img/@src").extract()
        data["image_url"] = image[0] if image else None
        data["schedule"] = "|".join(response.xpath("//div[@class='event-schedules']//p//text()").extract())
        data["description"] = "\n".join(response.xpath("//div[@class='event-description']//p//text()").extract()).replace("\xa0", " ")
        cast = []
        for line in response.xpath("//div[@class='event-datasheet']//p"):
            cast.append(" ".join(line.xpath(".//text()").extract()).replace("  ", " "))
        data["cast"] = "|".join(cast)

        translate_keys = [
            ("evento", "category"),
            ("genero", "genre"),
            ("classificacao", "classification"),
            ("duracao", "duration"),
            ("valor", "value"),
        ]
        for original, new in translate_keys:
            if original in data:
                data[new] = data.pop(original)

        yield data
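The `translate_keys` mapping only matches because `slug` has already normalized the scraped Portuguese labels (accents stripped, lowercased):

slug("Gênero")         # -> 'genero'
slug("Classificação")  # -> 'classificacao'
slug("Duração")        # -> 'duracao'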
Example #6
    def name(self):
        """Define table name based on its metadata (filename used on import)

        If `filename` is not available, return `table1`.
        """

        from rows.plugins import utils

        # TODO: may try read meta['name'] also (some plugins may set it)
        name = os.path.basename(self.meta.get("filename", "table1"))
        return utils.slug(os.path.splitext(name)[0])
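A behavior sketch, assuming `meta['filename']` was set by an import plugin (in the rows library this is exposed as a property):

table.meta["filename"] = "/data/Álvaro-2019.csv"  # normally set on import
table.name  # -> 'alvaro_2019'
# Without meta['filename'] the fallback is 'table1'.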
Example #7
    def test_slug(self):
        self.assertEqual(plugins_utils.slug(None), "")
        self.assertEqual(plugins_utils.slug("Álvaro Justen"), "alvaro_justen")
        self.assertEqual(plugins_utils.slug("Moe's Bar"), "moes_bar")
        self.assertEqual(plugins_utils.slug("-----te-----st------"), "te_st")
        # As in <https://github.com/turicas/rows/issues/179>
        self.assertEqual(
            plugins_utils.slug('Query Occurrence"( % ),"First Seen'),
            "query_occurrence_first_seen",
        )
        self.assertEqual(plugins_utils.slug(" ÁLVARO  justen% "), "alvaro_justen")
        self.assertEqual(plugins_utils.slug(42), "42")
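The extra assertions pin down two edge cases: `slug(None)` returns an empty string and non-string input is coerced with `str()`. A minimal sketch consistent with every assertion above (explicitly not the library's real implementation):

import re
from unicodedata import normalize

def slug_sketch(text):
    if text is None:
        return ""
    # Drop accents, then quotes/apostrophes ("Moe's" -> "moes", not "moe_s").
    text = normalize("NFKD", str(text)).encode("ascii", "ignore").decode("ascii")
    text = text.replace("'", "").replace('"', "")
    # Collapse every other non-alphanumeric run into a single underscore.
    text = re.sub(r"[^a-z0-9]+", "_", text.lower())
    return text.strip("_")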
Example #8
    def extract(self):
        header, empty_lines = None, 0
        wb = load_workbook(self.filename, read_only=True, data_only=True)
        sheet = wb.active
        for row in sheet.rows:
            line = [cell.value for cell in row]
            if set(line).issubset(EMPTY_SET):  # Skip empty lines
                empty_lines += 1
                if empty_lines == 50:
                    # Probably end of data, ignore other empty lines
                    break
                else:
                    continue

            line = [str(value or "").strip() for value in line]
            if header is None:  # Maybe the header
                line = [slug(value) for value in line]
                if "matricula" in line and "cargo" in line:  # Header line!
                    header = line
            else:  # Regular row
                # TODO: translate field names
                # TODO: convert data types
                row = {field: value for field, value in zip(header, line)}
                for f in [
                        "field_0",
                        "nomeacao",
                        "nome",
                        "matricula",
                        "cargo",
                        "funcao",
                        "lotacao",
                        "remuneracao",
                        "outras_verbas_remuneratorias",
                        "funcao_de_confianca_ou_cargo_em_comissao",
                        "gratificacao_natalina",
                        "ferias_13",
                        "abono_permanencia",
                        "total_de_rendimentos_brutos",
                        "contribuicao_previdenciaria",
                        "irrf",
                        "outros_descontos",
                        "total_de_descontos",
                        "rendimento_liquido_total",
                        "indenizacoes",
                        "outras_remuneracoes_retroativas_eou_temporarias",
                        "field_18",
                ]:
                    if f not in row:
                        row[f] = None
                yield row
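A hypothetical driver for this generator (the owning class is not shown in the excerpt, so the name below is invented):

extractor = SalarySheetExtractor("remuneracoes.xlsx")  # hypothetical class
for record in extractor.extract():
    print(record["nome"], record["cargo"], record["rendimento_liquido_total"])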
Example #9
    def __init__(self, fields, meta=None):
        from rows.plugins import utils

        # TODO: should we really use OrderedDict here?
        # TODO: should use slug on each field name automatically or inside each
        #       plugin?
        self.fields = OrderedDict(
            [
                (utils.slug(field_name), field_type)
                for field_name, field_type in OrderedDict(fields).items()
            ]
        )

        # TODO: should be able to customize row return type (namedtuple, dict
        #       etc.)
        self.Row = namedtuple("Row", self.field_names)
        self._rows = []
        self.meta = dict(meta) if meta is not None else {}
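A construction sketch, assuming this is rows' `Table` class: field names are slugged up front, so later lookups must use the slugged form:

from collections import OrderedDict
from rows import fields

table = Table(fields=OrderedDict([("Full Name", fields.TextField),
                                  ("Age", fields.IntegerField)]))
table.field_names  # -> ['full_name', 'age']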
Example #10
def download_name_stats(full_name):
    first_name = slug(full_name).split('_')[0]

    female = download_name_data(first_name, 'f')
    male = download_name_data(first_name, 'm')

    if female is None and male is None:
        return None

    alternative_names = []
    if female is not None:
        alternative_names += female['alternative_names']
    if male is not None:
        alternative_names += male['alternative_names']
    first_name = female['name'] if female is not None else male['name']
    female_frequency = female['frequency'] if female is not None else None
    male_frequency = male['frequency'] if male is not None else None

    if female_frequency and not male_frequency:
        classification = 'F'
        ratio = 1
    elif male_frequency and not female_frequency:
        classification = 'M'
        ratio = 1
    else:
        total = float(female_frequency + male_frequency)
        if female_frequency >= male_frequency:
            classification = 'F'
            ratio = female_frequency / total
        else:
            classification = 'M'
            ratio = male_frequency / total

    return {
        'alternative_names': sorted(set(alternative_names)),
        'classification': classification,
        'frequency_female': female_frequency,
        'frequency_male': male_frequency,
        'ratio': ratio,
    }
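An illustrative call (the returned numbers depend entirely on the remote dataset):

stats = download_name_stats("Maria da Silva")  # only 'maria' is looked up
if stats is not None:
    print(stats["classification"], stats["ratio"])  # e.g. 'F' 0.99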
Example #11
def pgimport(
    filename,
    database_uri,
    table_name,
    encoding="utf-8",
    dialect=None,
    create_table=True,
    schema=None,
    callback=None,
    timeout=0.1,
    chunk_size=8388608,
    max_samples=10000,
):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """

    fobj = open_compressed(filename, mode="r", encoding=encoding)
    sample = fobj.read(chunk_size)

    if dialect is None:  # Detect dialect
        dialect = rows.plugins.csv.discover_dialect(
            sample.encode(encoding), encoding=encoding
        )
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:
        # Detect field names
        reader = csv.reader(io.StringIO(sample), dialect=dialect)
        field_names = [slug(field_name) for field_name in next(reader)]

    else:
        field_names = list(schema.keys())

    if create_table:
        if schema is None:
            data = [
                dict(zip(field_names, row))
                for row in itertools.islice(reader, max_samples)
            ]
            table = rows.import_from_dicts(data)
            field_types = [table.fields[field_name] for field_name in field_names]
        else:
            field_types = list(schema.values())

        columns = [
            "{} {}".format(name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name, field_types=", ".join(columns)
        )
        execute_command(get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        dialect=dialect,
        direction="FROM",
        encoding=encoding,
        header=field_names,
        table_name=table_name,
    )
    rows_imported, error = 0, None
    fobj = open_compressed(filename, mode="rb")
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        total_written = 0
        while data != b"":
            written = process.stdin.write(data)
            total_written += written
            if callback:
                callback(written, total_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b"":
            raise RuntimeError(stderr.decode("utf-8"))
        rows_imported = int(stdout.replace(b"COPY ", b"").strip())

    except FileNotFoundError:
        raise RuntimeError("Command `psql` not found")

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode("utf-8"))

    return {"bytes_written": total_written, "rows_imported": rows_imported}
Example #12
def first_name(full_name):
    return slug(full_name).split('_')[0].upper()
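Given slug's behavior shown above, this returns the accent-free first token, uppercased:

first_name("Álvaro Justen")  # -> 'ALVARO'
first_name(" maria-clara ")  # -> 'MARIA'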
Example #13
def pgimport(filename,
             database_uri,
             table_name,
             encoding='utf-8',
             create_table=True,
             progress=False,
             timeout=0.1,
             chunk_size=8388608,
             max_samples=10000):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """

    # Extract a sample from the CSV to detect its dialect and header
    fobj = open_compressed(filename, mode='r', encoding=encoding)
    sample = fobj.read(chunk_size).encode(encoding)
    dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    reader = csv.reader(io.StringIO(sample.decode(encoding)))
    field_names = [slug(field_name) for field_name in next(reader)]

    if create_table:
        data = [
            dict(zip(field_names, row))
            for row in itertools.islice(reader, max_samples)
        ]
        table = rows.import_from_dicts(data)
        field_types = [table.fields[field_name] for field_name in field_names]
        columns = [
            '{} {}'.format(
                name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name,
            field_types=', '.join(columns),
        )
        execute_command(
            get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        table_name=table_name,
        header=field_names,
        dialect=dialect,
        encoding=encoding,
    )
    rows_imported, error, total_size = 0, None, None
    try:
        total_size = uncompressed_size(filename)
    except (RuntimeError, ValueError):
        pass

    if progress:
        progress_bar = tqdm(
            desc='Importing data',
            unit='bytes',
            unit_scale=True,
            unit_divisor=1024,
            total=total_size,
        )

    fobj = open_compressed(filename, mode='rb')
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        while data != b'':
            data_written = process.stdin.write(data)
            if progress:
                progress_bar.update(data_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b'':
            raise RuntimeError(stderr.decode('utf-8'))
        rows_imported = int(stdout.replace(b'COPY ', b'').strip())

    except FileNotFoundError:
        raise RuntimeError('Command `psql` not found')

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode('utf-8'))

    if progress:
        progress_bar.close()

    return rows_imported
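Unlike the later revision above, this version returns only the row count and drives a tqdm progress bar itself; a hedged call:

count = pgimport("data.csv", "postgres://localhost/db", "my_table",
                 progress=True)  # bar total comes from uncompressed_size()
print(count, "rows imported")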