def get_psql_copy_command(table_name, header, encoding,
                          user=None, password=None, host=None, port=None,
                          database_name=None, database_uri=None,
                          dialect=csv.excel):
    table_name = slug(table_name)
    header = ', '.join(slug(field_name) for field_name in header)
    copy = ("\\copy {table_name} ({header}) FROM STDIN "
            "DELIMITER '{delimiter}' "
            "QUOTE '{quote}' "
            "ENCODING '{encoding}' "
            "CSV HEADER;").format(
                table_name=table_name,
                header=header,
                delimiter=dialect.delimiter.replace("'", "\\'"),
                quote=dialect.quotechar.replace("'", "\\'"),
                encoding=encoding)

    return get_psql_command(copy, user=user, password=password,
                            host=host, port=port,
                            database_name=database_name,
                            database_uri=database_uri)
def test_slug(self):
    self.assertEqual(plugins_utils.slug('Álvaro Justen'), 'alvaro_justen')
    self.assertEqual(plugins_utils.slug("Moe's Bar"), 'moes_bar')
    self.assertEqual(plugins_utils.slug("-----te-----st------"), 'te_st')
    # As in <https://github.com/turicas/rows/issues/179>
    self.assertEqual(
        plugins_utils.slug('Query Occurrence"( % ),"First Seen'),
        'query_occurrence_first_seen')
    self.assertEqual(plugins_utils.slug(' ÁLVARO justen% '), 'alvaro_justen')
def get_psql_copy_command(
    table_name,
    header,
    encoding="utf-8",
    user=None,
    password=None,
    host=None,
    port=None,
    database_name=None,
    database_uri=None,
    dialect=csv.excel,
    direction="FROM",
):
    direction = direction.upper()
    if direction not in ("FROM", "TO"):
        raise ValueError('`direction` must be "FROM" or "TO"')

    table_name = slug(table_name)
    if header is None:
        header = ""
    else:
        header = ", ".join(slug(field_name) for field_name in header)
        header = "({header}) ".format(header=header)
    # psql reads from STDIN on import ("FROM") and writes to STDOUT on
    # export ("TO"); "TO STDIN" is not valid \copy syntax
    stream = "STDIN" if direction == "FROM" else "STDOUT"
    copy = (
        "\\copy {table_name} {header}{direction} {stream} "
        "DELIMITER '{delimiter}' "
        "QUOTE '{quote}' "
        "ENCODING '{encoding}' "
        "CSV HEADER;"
    ).format(
        table_name=table_name,
        header=header,
        direction=direction,
        stream=stream,
        delimiter=dialect.delimiter.replace("'", "''"),
        quote=dialect.quotechar.replace("'", "''"),
        encoding=encoding,
    )

    return get_psql_command(
        copy,
        user=user,
        password=password,
        host=host,
        port=port,
        database_name=database_name,
        database_uri=database_uri,
    )
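# A hypothetical call to the function above; the connection details are made
# up. The returned string is a full `psql` invocation (assembled by the
# `get_psql_command` helper, not shown here) whose command is:
#   \copy my_table (name, age) FROM STDIN DELIMITER ',' QUOTE '"'
#   ENCODING 'utf-8' CSV HEADER;
command = get_psql_copy_command(
    "My Table",
    ["Name", "Age"],
    database_uri="postgres://user:password@localhost:5432/mydb",
)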
def __setitem__(self, key, value):
    key_type = type(key)
    if key_type == int:
        self._rows[key] = self._make_row(value)
    elif key_type is six.text_type:
        from rows import fields
        from rows.plugins import utils

        values = list(value)  # I'm not lazy, sorry
        if len(values) != len(self):
            raise ValueError(
                "Values length ({}) should be the same as "
                "Table length ({})".format(len(values), len(self))
            )

        field_name = utils.slug(key)
        is_new_field = field_name not in self.field_names
        field_type = fields.detect_types(
            [field_name], [[value] for value in values]
        )[field_name]
        self.fields[field_name] = field_type
        self.Row = namedtuple("Row", self.field_names)

        if is_new_field:
            for row, value in zip(self._rows, values):
                row.append(field_type.deserialize(value))
        else:
            field_index = self.field_names.index(field_name)
            for row, value in zip(self._rows, values):
                row[field_index] = field_type.deserialize(value)
    else:
        raise ValueError("Unsupported key type: {}".format(type(key).__name__))
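# Hypothetical usage of the __setitem__ above on a rows.Table: assigning a
# sequence to a text key creates or replaces a whole column, with the key
# slugged into a valid field name and the column type auto-detected.
table["Birth Date"] = ["1987-04-29", "1990-02-01", "2000-08-22"]
assert "birth_date" in table.field_names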
def parse_event(self, response):
    meta = response.request.meta
    data = meta["data"].copy()

    keys = [
        slug(key)
        for key in response.xpath(
            "(//ul[@class='details-list'])[1]//li/text()"
        ).extract()
    ]
    values = response.xpath(
        "(//ul[@class='details-list'])[1]//li/span/text()"
    ).extract()
    data.update(dict(zip(keys, values)))

    ticket = response.xpath(
        "(//a[contains(@href, 'ingresso.festivaldecuritiba.com.br')])[1]/@href"
    ).extract()
    data["ticket_url"] = ticket[0] if ticket else None

    image = response.xpath(
        "//div[@class='about-image gallery-content']//img/@src"
    ).extract()
    data["image_url"] = image[0] if image else None

    data["schedule"] = "|".join(
        response.xpath("//div[@class='event-schedules']//p//text()").extract()
    )
    data["description"] = "\n".join(
        response.xpath("//div[@class='event-description']//p//text()").extract()
    ).replace("\xa0", " ")

    cast = []
    for line in response.xpath("//div[@class='event-datasheet']//p"):
        cast.append(
            " ".join(line.xpath(".//text()").extract()).replace("  ", " ")
        )
    data["cast"] = "|".join(cast)

    # Rename the (slugged) Portuguese keys to their English equivalents
    translate_keys = [
        ("evento", "category"),
        ("genero", "genre"),
        ("classificacao", "classification"),
        ("duracao", "duration"),
        ("valor", "value"),
    ]
    for original, new in translate_keys:
        if original in data:
            data[new] = data.pop(original)

    yield data
def name(self):
    '''Define table name based on its metadata (filename used on import)

    If `filename` is not available, return `table1`.
    '''
    from rows.plugins import utils

    # TODO: may try to read meta['name'] also (some plugins may set it)
    name = os.path.basename(self.meta.get('filename', 'table1'))
    return utils.slug(os.path.splitext(name)[0])
def name(self): """Define table name based on its metadata (filename used on import) If `filename` is not available, return `table1`. """ from rows.plugins import utils # TODO: may try read meta['name'] also (some plugins may set it) name = os.path.basename(self.meta.get("filename", "table1")) return utils.slug(os.path.splitext(name)[0])
def test_slug(self):
    self.assertEqual(plugins_utils.slug(None), "")
    self.assertEqual(plugins_utils.slug("Álvaro Justen"), "alvaro_justen")
    self.assertEqual(plugins_utils.slug("Moe's Bar"), "moes_bar")
    self.assertEqual(plugins_utils.slug("-----te-----st------"), "te_st")
    # As in <https://github.com/turicas/rows/issues/179>
    self.assertEqual(
        plugins_utils.slug('Query Occurrence"( % ),"First Seen'),
        "query_occurrence_first_seen",
    )
    self.assertEqual(plugins_utils.slug(" ÁLVARO justen% "), "alvaro_justen")
    self.assertEqual(plugins_utils.slug(42), "42")
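# The assertions above pin down slug's contract. A minimal sketch that
# satisfies them (the real rows.plugins.utils.slug may differ) could be:
import re
import unicodedata


def slug(text, separator="_"):
    """Normalize text into a lowercase ASCII, underscore-separated slug"""
    if text is None:
        return ""
    text = str(text)
    # Strip accents: 'Álvaro' -> 'Alvaro'
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii").lower()
    # Drop punctuation entirely ("Moe's" -> "moes"), then collapse runs of
    # spaces, hyphens and underscores into a single separator
    text = re.sub(r"[^a-z0-9 \-_]", "", text)
    text = re.sub(r"[ \-_]+", separator, text)
    return text.strip(separator)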
def extract(self):
    header, empty_lines = None, 0
    wb = load_workbook(self.filename, read_only=True, data_only=True)
    sheet = wb.active
    for row in sheet.rows:
        line = [cell.value for cell in row]
        if set(line).issubset(EMPTY_SET):  # Skip empty lines
            empty_lines += 1
            if empty_lines == 50:
                # Probably end of data, ignore other empty lines
                break
            else:
                continue
        line = [str(value or "").strip() for value in line]

        if header is None:  # Maybe the header
            line = [slug(value) for value in line]
            if "matricula" in line and "cargo" in line:  # Header line!
                header = line

        else:  # Regular row
            # TODO: translate field names
            # TODO: convert data types
            row = {field: value for field, value in zip(header, line)}
            for f in [
                "field_0",
                "nomeacao",
                "nome",
                "matricula",
                "cargo",
                "funcao",
                "lotacao",
                "remuneracao",
                "outras_verbas_remuneratorias",
                "funcao_de_confianca_ou_cargo_em_comissao",
                "gratificacao_natalina",
                "ferias_13",
                "abono_permanencia",
                "total_de_rendimentos_brutos",
                "contribuicao_previdenciaria",
                "irrf",
                "outros_descontos",
                "total_de_descontos",
                "rendimento_liquido_total",
                "indenizacoes",
                "outras_remuneracoes_retroativas_eou_temporarias",
                "field_18",
            ]:
                if f not in row:
                    row[f] = None
            yield row
def __init__(self, fields, meta=None):
    from rows.plugins import utils

    # TODO: should we really use OrderedDict here?
    # TODO: should use slug on each field name automatically or inside each
    #       plugin?
    self.fields = OrderedDict([
        (utils.slug(field_name), field_type)
        for field_name, field_type in OrderedDict(fields).items()
    ])

    # TODO: should be able to customize row return type (namedtuple, dict
    #       etc.)
    self.Row = namedtuple('Row', self.field_names)
    self._rows = []
    self.meta = dict(meta) if meta is not None else {}
def download_name_stats(full_name):
    first_name = slug(full_name).split('_')[0]
    female = download_name_data(first_name, 'f')
    male = download_name_data(first_name, 'm')
    if female is None and male is None:
        return None

    alternative_names = []
    if female is not None:
        alternative_names += female['alternative_names']
    if male is not None:
        alternative_names += male['alternative_names']

    first_name = female['name'] if female is not None else male['name']
    female_frequency = female['frequency'] if female is not None else None
    male_frequency = male['frequency'] if male is not None else None
    if female_frequency and not male_frequency:
        classification = 'F'
        ratio = 1
    elif male_frequency and not female_frequency:
        classification = 'M'
        ratio = 1
    else:
        total = float(female_frequency + male_frequency)
        if female_frequency >= male_frequency:
            classification = 'F'
            ratio = female_frequency / total
        else:
            classification = 'M'
            ratio = male_frequency / total

    return {
        'alternative_names': sorted(set(alternative_names)),
        'classification': classification,
        'frequency_female': female_frequency,
        'frequency_male': male_frequency,
        'ratio': ratio,
    }
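# Hypothetical usage of download_name_stats above. `download_name_data` is
# assumed to query a first-name statistics service and return a dict with
# 'name', 'frequency' and 'alternative_names' keys (or None for unknown
# names); the example name is made up.
stats = download_name_stats('Maria Silva')
if stats is not None:
    print(stats['classification'], stats['ratio'])  # e.g.: 'F' 0.99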
def pgimport(
    filename,
    database_uri,
    table_name,
    encoding="utf-8",
    dialect=None,
    create_table=True,
    schema=None,
    callback=None,
    timeout=0.1,
    chunk_size=8388608,
    max_samples=10000,
):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """
    fobj = open_compressed(filename, mode="r", encoding=encoding)
    sample = fobj.read(chunk_size)

    if dialect is None:  # Detect dialect
        dialect = rows.plugins.csv.discover_dialect(
            sample.encode(encoding), encoding=encoding
        )
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:  # Detect field names
        reader = csv.reader(io.StringIO(sample), dialect=dialect)
        field_names = [slug(field_name) for field_name in next(reader)]
    else:
        field_names = list(schema.keys())

    if create_table:
        if schema is None:
            data = [
                dict(zip(field_names, row))
                for row in itertools.islice(reader, max_samples)
            ]
            table = rows.import_from_dicts(data)
            field_types = [table.fields[field_name] for field_name in field_names]
        else:
            field_types = list(schema.values())
        columns = [
            "{} {}".format(name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name, field_types=", ".join(columns)
        )
        execute_command(get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        dialect=dialect,
        direction="FROM",
        encoding=encoding,
        header=field_names,
        table_name=table_name,
    )
    rows_imported, error = 0, None
    fobj = open_compressed(filename, mode="rb")
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        total_written = 0
        while data != b"":
            written = process.stdin.write(data)
            total_written += written
            if callback:
                callback(written, total_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b"":
            raise RuntimeError(stderr.decode("utf-8"))
        rows_imported = int(stdout.replace(b"COPY ", b"").strip())

    except FileNotFoundError:
        raise RuntimeError("Command `psql` not found")

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode("utf-8"))

    return {"bytes_written": total_written, "rows_imported": rows_imported}
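# A hypothetical call to the pgimport above; the URI, path and callback are
# made up. `callback` receives the bytes written for each chunk plus the
# running total, so any progress reporting can be plugged in.
def print_progress(written, total_written):
    print("wrote {} bytes ({} total)".format(written, total_written))

result = pgimport(
    "data.csv.gz",
    "postgres://user:password@localhost:5432/mydb",
    "my_table",
    callback=print_progress,
)
print(result["rows_imported"], "rows imported")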
def first_name(full_name):
    return slug(full_name).split('_')[0].upper()
def pgimport(filename, database_uri, table_name, encoding='utf-8',
             create_table=True, progress=False, timeout=0.1,
             chunk_size=8388608, max_samples=10000):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """
    # Extract a sample from the CSV to detect its dialect and header
    fobj = open_compressed(filename, mode='r', encoding=encoding)
    sample = fobj.read(chunk_size).encode(encoding)
    dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    reader = csv.reader(io.StringIO(sample.decode(encoding)))
    field_names = [slug(field_name) for field_name in next(reader)]

    if create_table:
        data = [
            dict(zip(field_names, row))
            for row in itertools.islice(reader, max_samples)
        ]
        table = rows.import_from_dicts(data)
        field_types = [table.fields[field_name] for field_name in field_names]
        columns = [
            '{} {}'.format(
                name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name,
            field_types=', '.join(columns),
        )
        execute_command(
            get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        table_name=table_name,
        header=field_names,
        dialect=dialect,
        encoding=encoding,
    )
    rows_imported, error, total_size = 0, None, None
    try:
        total_size = uncompressed_size(filename)
    except (RuntimeError, ValueError):
        pass

    if progress:
        progress_bar = tqdm(
            desc='Importing data',
            unit='bytes',
            unit_scale=True,
            unit_divisor=1024,
            total=total_size,
        )
    fobj = open_compressed(filename, mode='rb')
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        while data != b'':
            data_written = process.stdin.write(data)
            if progress:
                progress_bar.update(data_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b'':
            raise RuntimeError(stderr.decode('utf-8'))
        rows_imported = int(stdout.replace(b'COPY ', b'').strip())

    except FileNotFoundError:
        raise RuntimeError('Command `psql` not found')

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode('utf-8'))

    if progress:
        progress_bar.close()

    return rows_imported
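# A hypothetical call to the variant above, which reports progress itself via
# tqdm and returns the number of rows imported; URI and path are made up.
imported = pgimport(
    'data.csv.gz',
    'postgres://user:password@localhost:5432/mydb',
    'my_table',
    progress=True,
)
print('{} rows imported'.format(imported))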