def test_join_feature(self):
    tables = [
        rows.import_from_csv('tests/data/to-merge-1.csv'),
        rows.import_from_csv('tests/data/to-merge-2.csv'),
        rows.import_from_csv('tests/data/to-merge-3.csv'),
    ]
    merged = rows.join(keys=('id', 'username'), tables=tables)
    expected = rows.import_from_csv('tests/data/merged.csv')
    self.assert_table_equal(merged, expected)
def test_real_data_3(self):
    filename = "tests/data/eleicoes-tcesp-161-162.pdf"
    expected1 = "tests/data/expected-eleicoes-tcesp-161-{}.csv".format(self.backend)
    expected2 = "tests/data/expected-eleicoes-tcesp-162-{}.csv".format(self.backend)
    begin = re.compile("Documento gerado em.*")
    end = re.compile("Página: [0-9]+ de.*")

    result = rows.import_from_pdf(
        filename,
        backend=self.backend,
        page_numbers=(1,),
        starts_after=begin,
        ends_before=end,
        algorithm="header-position",
    )
    expected = rows.import_from_csv(expected1)
    self.assertEqual(list(expected), list(result))

    result = rows.import_from_pdf(
        filename,
        backend=self.backend,
        page_numbers=(2,),
        starts_after=begin,
        ends_before=end,
        algorithm="header-position",
    )
    expected = rows.import_from_csv(expected2)
    self.assertEqual(list(expected), list(result))
def test_import_from_csv_discover_dialect(self, mocked_create_table):
    data, lines = make_csv_data(
        quote_char="'",
        field_delimiter=";",
        line_delimiter="\r\n",
    )
    fobj = BytesIO()
    fobj.write(lines.encode('utf-8'))
    fobj.seek(0)

    rows.import_from_csv(fobj)
    call_args = mocked_create_table.call_args_list[0]
    self.assertEqual(data, list(call_args[0][0]))
def test_import_from_csv_retrieve_desired_data(self, mocked_create_table):
    mocked_create_table.return_value = 42

    # import using filename
    table_1 = rows.import_from_csv(self.filename)
    call_args = mocked_create_table.call_args_list[0]
    self.assert_create_table_data(call_args)

    # import using fobj
    with open(self.filename, 'rb') as fobj:
        table_2 = rows.import_from_csv(fobj)
        call_args = mocked_create_table.call_args_list[1]
        self.assert_create_table_data(call_args)
def test_import_from_csv_retrieve_desired_data(self, mocked_create_table):
    mocked_create_table.return_value = 42

    # import using filename
    rows.import_from_csv(self.filename)
    call_args = mocked_create_table.call_args_list[0]
    self.assert_create_table_data(call_args)

    # import using fobj
    with open(self.filename, 'rb') as fobj:
        rows.import_from_csv(fobj)
        call_args = mocked_create_table.call_args_list[1]
        self.assert_create_table_data(call_args)
def test_both_confirmed_cases_and_deaths_columns_must_be_filled(self):
    original_content = self.content

    # missing confirmed cases
    self.content = original_content.replace("Abatiá,9,1", "Abatiá,,1")
    file_rows = rows.import_from_csv(self.file_from_content)
    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)

    # missing deaths
    self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,")
    file_rows = rows.import_from_csv(self.file_from_content)
    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
def test_both_confirmed_cases_and_deaths_columns_must_be_integers(self):
    original_content = self.content

    # confirmed cases as float
    self.content = original_content.replace("Abatiá,9,1", "Abatiá,9.10,1")
    file_rows = rows.import_from_csv(self.file_from_content)
    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)

    # deaths as float
    self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,1.10")
    file_rows = rows.import_from_csv(self.file_from_content)
    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
def merge_files(filenames, output):
    'Merge all game files into one CSV file, adding year and country columns'

    if not output.parent.exists():
        output.parent.mkdir()

    countries_by_code = download_countries()
    games = rows.Table(fields=FULL_FIELDS)
    for filename in filenames:
        year, country_code = _parse_filename(filename)
        country = countries_by_code[country_code]
        print('Merging year: {}, country: {}...'.format(year, country.name))
        game = rows.import_from_csv(
            str(filename.absolute()),
            fields=FIELDS,
            dialect=csv.excel,
            encoding='utf-8',
        )
        for row in game:
            data = row._asdict()
            data['year'] = year
            data['country_code'] = country_code
            data['country_name'] = country.name
            del data['rk']
            games.append(data)

    games.order_by('-year')
    rows.utils.export_to_uri(games, str(output.absolute()))
def test_not_valid_if_sum_of_deaths_does_not_matches_with_total(self):
    self.content = self.content.replace(
        'TOTAL NO ESTADO,102,32', 'TOTAL NO ESTADO,102,50'
    )
    file_rows = rows.import_from_csv(self.file_from_content)

    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
def from_idh_csv(filename):
    entries = rows.import_from_csv(filename)

    # list of codes
    codes = [
        e.code.strip()
        for e in sorted(entries, key=lambda x: x.idh, reverse=True)
    ]
    # list of languages
    languages = [
        e.language.strip()
        for e in sorted(entries, key=lambda x: x.idh, reverse=True)
    ]

    codelang = remove_consecutives({'codes': codes, 'languages': languages})
    chain_str = ' - '.join(codes)
    cprint.ok(f"Translation chain: {chain_str}.")
    cprint.ok(f"Input text: {TEXT}\n")

    start_codelang = {'codes': 'pt', 'languages': 'Portuguese'}
    text, result = chain_translate_text(TEXT, start_codelang, codelang, monitoring=False)

    cprint.ok("\n##### RESULTS ######\n")
    cprint.ok(text)
    print()
    cprint.ok(result)
def prepara_data_table(self):
    try:
        data = rows.import_from_csv(self.csv_file)
    except Exception as e:
        print('Arquivo csv inválido, ou não existe')
        raise e
    return data
def find_unfinished():
    unfinished = []
    subtitles = rows.import_from_csv(CSV_FILE)
    for s in subtitles:
        if s.field_4 == NAME and s.field_5 != '已完成':
            unfinished.append(s)
    return unfinished
def test_detect_weird_dialect(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    # If the sniffer reads only the first line, it will think the delimiter
    # is ',' instead of '|'
    encoding = "utf-8"
    data = BytesIO(
        textwrap.dedent(
            """
            field1|field2|field3|field4
            1|2|3|4
            5|6|7|8
            9|0|1|2
            """
        )
        .strip()
        .encode(encoding)
    )

    table = rows.import_from_csv(data, encoding=encoding, lazy=False)
    self.assertEqual(table.field_names, ["field1", "field2", "field3", "field4"])

    expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 0, 1, 2]]
    for expected_data, row in zip(expected, table):
        row = [row.field1, row.field2, row.field3, row.field4]
        self.assertEqual(expected_data, row)
def form_valid(self, form):
    registros = rows.import_from_csv(form.cleaned_data['arquivo'])
    identificador = uuid4()
    for registro in registros:
        cliente = Cliente.objects.get_or_create(
            nome=registro.purchaser_name)[0]
        comerciante = Comerciante.objects.get_or_create(
            nome=registro.merchant_name)[0]
        endereco_comerciante = comerciante.enderecos.get_or_create(
            endereco=registro.merchant_address)[0]
        produto = Produto.objects.get_or_create(
            descricao=registro.item_description)[0]
        Venda.objects.create(
            cliente=cliente,
            comerciante=comerciante,
            endereco_comerciante=endereco_comerciante,
            produto=produto,
            valor_unitario=registro.item_price,
            quantidade=registro.purchase_count,
            identificador=identificador,
        )
    total = number_format(Venda.objects.receita_bruta_total(identificador))
    messages.success(
        self.request,
        f'Importação realizada. Receita Bruta Total: {total}.')
    return super().form_valid(form)
def test_skip_sum_validations_if_flagged_in_the_form_data(self, mocked_format):
    mocked_format.return_value = (["results", "list"], ["warnings", "list"])
    self.data.update({
        "skip_sum_cases": True,
        "skip_sum_deaths": True,
    })
    form = StateSpreadsheetForm(self.data, self.file_data, user=self.user)
    assert form.is_valid(), form.errors

    expected = {
        "table": ["results", "list"],
        "errors": [],
        "warnings": ["warnings", "list"],
    }
    spreadsheet = form.save()
    spreadsheet.refresh_from_db()
    assert expected == spreadsheet.data

    assert 1 == mocked_format.call_count
    method_call = mocked_format.call_args_list[0]
    data, import_date, state = method_call[0]
    kwargs = method_call[1]
    assert date.today() == import_date
    assert state == "PR"
    for entry, expected_entry in zip(data, rows.import_from_csv(self.file_data["file"])):
        assert entry._asdict() == expected_entry._asdict()
    assert kwargs["skip_sum_cases"] is True
    assert kwargs["skip_sum_deaths"] is True
def read_cases(input_filename, order_by=None):
    cases = rows.import_from_csv(
        input_filename,
        force_types=load_schema(str(SCHEMA_PATH / "caso.csv")),
    )
    if order_by:
        cases.order_by(order_by)
    return cases
def test_detect_dialect_using_json(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = '{}.{}'.format(temp.name, self.file_extension)
    encoding = 'utf-8'
    self.files_to_delete.append(filename)

    # Using JSON will force the sniffer not to include ':' and '}' in the
    # possible delimiters
    table = rows.Table(fields=OrderedDict([
        ('jsoncolumn1', rows.fields.JSONField),
        ('jsoncolumn2', rows.fields.JSONField),
    ]))
    table.append({
        'jsoncolumn1': '{"a": 42}',
        'jsoncolumn2': '{"b": 43}',
    })
    table.append({
        'jsoncolumn1': '{"c": 44}',
        'jsoncolumn2': '{"d": 45}',
    })
    rows.export_to_csv(table, filename, encoding=encoding)

    table = rows.import_from_csv(filename, encoding=encoding)

    self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
    self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
    self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
    self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
    self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
def test_confirmed_cases_must_be_equal_or_greater_than_deaths(self):
    original_content = self.content

    self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,20")
    file_rows = rows.import_from_csv(self.file_from_content)
    with pytest.raises(SpreadsheetValidationErrors):
        format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
def test_quotes(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = '{}.{}'.format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.Table(fields=OrderedDict([
        ('field_1', rows.fields.TextField),
        ('field_2', rows.fields.TextField),
        ('field_3', rows.fields.TextField),
        ('field_4', rows.fields.TextField),
    ]))
    table.append({
        'field_1': '"quotes"',
        'field_2': 'test "quotes"',
        'field_3': '"quotes" test',
        'field_4': 'test "quotes" test',
    })
    # we need this second row since `"quotes"` on `field_1` could be
    # detected as either `JSONField` or `TextField`
    table.append({
        'field_1': 'noquotes',
        'field_2': 'test "quotes"',
        'field_3': '"quotes" test',
        'field_4': 'test "quotes" test',
    })
    rows.export_to_csv(table, filename)

    table2 = rows.import_from_csv(filename)
    self.assert_table_equal(table, table2)
def start_requests(self):
    links = rows.import_from_csv(settings.OUTPUT_PATH / "filiacao-links.csv")
    for row in links:
        yield scrapy.Request(
            url="file://" + str(Path(row.filename).absolute()),
            meta=row._asdict(),
        )
def serialize_cases(buffer):
    for case in import_from_csv(buffer):
        case = case._asdict()
        data = {
            new_label: case.get(old_label)
            for old_label, new_label in CASE_LABELS
        }
def parse(self, response):
    url_table = rows.import_from_csv(
        io.BytesIO(response.body),
        encoding="utf-8",
        force_types=HASH_FIELDS,
    )
    self.URLInfo = url_table.Row
    self.url_hashes = {row.url: row for row in url_table}
    yield scrapy.Request(URL_LIST_URL, callback=self.parse_url_list)
def situacao_candidatura(self):
    return {
        (
            row.codigo_situacao_candidatura,
            row.situacao_candidatura,
        ): row.nova_situacao_candidatura
        for row in rows.import_from_csv(
            settings.HEADERS_PATH / "situacao-candidatura.csv",
        )
    }
def test_export_to_csv_filename(self):
    # TODO: may test file contents
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    rows.export_to_csv(utils.table, temp.name)

    table = rows.import_from_csv(temp.name)
    self.assert_table_equal(table, utils.table)
def read_epidemiological_week(): filename = "data/epidemiological-week.csv" table = rows.import_from_csv(filename) return { row.date: int(f"{row.epidemiological_year}{row.epidemiological_week:02d}") for row in table }
def parse(self, response):
    table = rows.import_from_csv(io.BytesIO(response.body), encoding="utf-8")
    for row in table:
        yield scrapy.Request(
            gdocs_xlsx_download_url(row.planilha_brasilio),
            meta={"state": row.uf},
            callback=self.parse_state_file,
        )
def get_cities():
    table = rows.import_from_csv(
        POPULATION_DATA_PATH,
        force_types=load_schema(str(POPULATION_SCHEMA_PATH)),
    )
    cities = defaultdict(dict)
    for row in table:
        cities[row.state][row.city] = row
    return cities
def test_import_from_csv_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    with open(self.filename) as fobj:
        table = rows.import_from_csv(fobj, encoding=self.encoding)
        self.assert_table_equal(table, utils.table)

    expected_meta = {'imported_from': 'csv', 'filename': self.filename}
    self.assertEqual(table.meta, expected_meta)
def get_actions_for_state(state):
    url = "https://docs.google.com/spreadsheets/d/1epxFffymqv1t2s37rQ-p5eKpvecOIBzCfJPLI53wYTY/export?format=csv&id=1epxFffymqv1t2s37rQ-p5eKpvecOIBzCfJPLI53wYTY&gid=1565988556"
    response = requests.get(url)
    table = rows.import_from_csv(io.BytesIO(response.content), encoding="utf-8")
    return [
        Action(year=row.ano, state=row.estado, name=row.nome_acao, code=row.codigo_acao)
        for row in table
        if row.estado == state and all((row.ano, row.estado, row.codigo_acao, row.nome_acao))
    ]
def read_epidemiological_week():
    # TODO: use pkg_resources to get correct path
    filename = "covid19br/data/epidemiological-week.csv"
    table = rows.import_from_csv(filename)
    return {
        row.date: int(f"{row.epidemiological_year}{row.epidemiological_week:02d}")
        for row in table
    }
def test_validate_if_all_cities_exists_are_in_the_state(self):
    file_rows = rows.import_from_csv(self.file_from_content)

    with pytest.raises(SpreadsheetValidationErrors) as execinfo:
        format_spreadsheet_rows_as_dict(file_rows, self.date, "SP")

    exception = execinfo.value
    assert "Abatiá não pertence à UF SP" in exception.error_messages
    assert "Adrianópolis não pertence à UF SP" in exception.error_messages
def test_real_data_2(self):
    filename = "tests/data/milho-safra-2017"
    result = rows.import_from_pdf(
        filename + ".pdf",
        backend=self.backend,
        starts_after=re.compile("MILHO SAFRA 16/17: ACOMPANHAMENTO DE .*"),
        ends_before="*Variação em pontos percentuais.",
    )
    expected = rows.import_from_csv(filename + ".csv")
    self.assertEqual(list(expected), list(result))
def test_import_from_csv_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    kwargs = {'encoding': 'utf-8', 'some_key': 123, 'other': 456}
    result = rows.import_from_csv(self.filename, **kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs['meta'] = {'imported_from': 'csv', 'filename': self.filename}
    self.assertEqual(call[1], kwargs)
def test_rects_boundaries(self):
    filename = "tests/data/ibama-autuacao-amazonas-2010-pag2"
    result = rows.import_from_pdf(
        filename + ".pdf",
        backend=self.backend,
        starts_after=re.compile("DIRETORIA DE PROTE.*"),
        ends_before=re.compile("Pag [0-9]+/[0-9]+"),
        algorithm="rects-boundaries",
    )
    expected = rows.import_from_csv(filename + ".csv")
    self.assertEqual(list(expected), list(result))
def test_issue_168(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
    table.append({"jsoncolumn": '{"python": 42}'})
    rows.export_to_csv(table, filename)

    table2 = rows.import_from_csv(filename)
    self.assert_table_equal(table, table2)
def test_import_field_limit(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.import_from_dicts([{"f1": "a" * 132000}])
    rows.export_to_csv(table, filename)

    # The following line must not raise the exception:
    # `_csv.Error: field larger than field limit (131072)`
    new = rows.import_from_csv(filename)
def test_export_to_csv_filename(self):
    # TODO: may test file contents
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    rows.export_to_csv(utils.table, temp.name)

    table = rows.import_from_csv(temp.name)
    self.assert_table_equal(table, utils.table)

    temp.file.seek(0)
    result = temp.file.read()
    export_in_memory = rows.export_to_csv(utils.table, None)
    self.assertEqual(result, export_in_memory)
def setUp(self):
    rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
    fields_xpath = OrderedDict([
        ('url', './/h2/a/@href'),
        ('name', './/h2/a/text()'),
        ('address', './/div[@class="spField field_direccion"]/text()'),
        ('phone', './/div[@class="spField field_telefono"]/text()'),
        ('website', './/div[@class="spField field_sitio_web"]/text()'),
        ('email', './/div[@class="spField field_email"]/text()'),
    ])
    self.kwargs = {'rows_xpath': rows_xpath, 'fields_xpath': fields_xpath}
    self.expected_table = rows.import_from_csv(self.expected_data)
    self.files_to_delete = []
def test_import_from_csv_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    kwargs = {"some_key": 123, "other": 456}
    result = rows.import_from_csv(self.filename, encoding="utf-8", **kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs["meta"] = {
        "imported_from": "csv",
        "filename": self.filename,
        "encoding": "utf-8",
    }
    self.assertEqual(call[1], kwargs)
def test_import_from_xpath_filename(self):
    table = rows.import_from_xpath(self.filename, encoding=self.encoding, **self.kwargs)

    expected_meta = {'imported_from': 'xpath', 'filename': self.filename}
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)

    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def import_data(data_dir):
    for f in csv_files(data_dir):
        csv = rows.import_from_csv(str(f.realpath()))
        logging.warning('Import file:' + str(f.realpath()))
        csv = csv[::-1]

        s = sessions_history(csv[0])
        s['prev_close'] = csv[0].open
        sh = Sessions_History()
        sh.set(**s)
        commit()

        size = len(csv)
        for i in range(1, size):
            s = sessions_history(csv[i])
            s['prev_close'] = csv[i - 1].close
            sh = Sessions_History()
            sh.set(**s)
            commit()
def test_import_from_csv_discover_dialect_decode_error(self):
    # Create a 1024-byte line (when encoded to ASCII, UTF-8 etc.)
    line = '"' + ("a" * 508) + '", "' + ("b" * 508) + '"\r\n'
    lines = 256 * line  # 256KiB

    # Now change the last byte (in the 256KiB sample) so it holds only half
    # of a character's representation (when encoded to UTF-8)
    data = lines[:-3] + '++Á"\r\n'
    data = data.encode("utf-8")

    # Should not raise `UnicodeDecodeError`
    table = rows.import_from_csv(
        BytesIO(data), encoding="utf-8", sample_size=262144
    )

    last_row = table[-1]
    last_column = "b" * 508
    self.assertEqual(getattr(last_row, last_column), "b" * 508 + "++Á")
def run(self):
    google_username = "******"
    google_password = "******"
    path = "/home/vagner/workspace/ITSGoogleTrends/output/"

    # connect to Google
    try:
        self.show("Realizando conexão com usuário " + google_username)
        connector = pyGTrends(google_username, google_password)
        self.show("Conexão realizada com sucesso")
    except Exception as e:
        raise ITSGoogleTrendsError("Erro durante a conexão com o Google.")

    # build the request string
    # read the data from the CSV file
    table = rows.import_from_csv(self._CSV_FILE_PATH + self._CSV_FILE_NAME)
    rows_number = len(table)
    index = 0
    for row in table:
        its_name = str(row.system).lower()
        index = index + 1
        self.show("Início da busca dos dados de tendência para o ITS: {0:s} [{1:d}/{2:d}]".format(its_name, index, rows_number))

        str_request = self._DEFAUT_KEYWORD + "," + its_name
        self.show("Realizando uma requisição com a sentença " + str_request)

        # make request
        connector.request_report(str_request)

        # download file
        self._now = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
        csv_file_name = "{0:s}-{1:s}".format(self._now, str_request.replace(",", "-").replace(" ", "-"))
        connector.save_csv(path, csv_file_name)
        self.show("Resultados escritos no arquivos {0:s}.csv".format(csv_file_name))

        # wait a random amount of time between requests to avoid bot detection
        wait_time = randint(5, 10)
        self.show("Aguardando {0:d} segundos para uma nova requisição".format(wait_time))
        time.sleep(wait_time)
    # end for

    self.show("Execução realizada com sucesso!")
def test_import_from_xpath_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    with open(self.filename, mode='rb') as fobj:
        table = rows.import_from_xpath(fobj, encoding=self.encoding, **self.kwargs)

    expected_meta = {
        'imported_from': 'xpath',
        'filename': self.filename,
        'encoding': self.encoding,
    }
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)

    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def test_detect_dialect_more_data(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = '{}.{}'.format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    # If the sniffer reads only the first line, it will think the delimiter
    # is ',' instead of ';'
    data = textwrap.dedent('''
        field1,samefield;field2,other
        row1value1;row1value2
        row2value1;row2value2
        ''').strip()
    with open(filename, 'wb') as fobj:
        fobj.write(data.encode('utf-8'))

    table = rows.import_from_csv(filename, encoding='utf-8')
    self.assertEqual(table.field_names, ['field1samefield', 'field2other'])
    self.assertEqual(table[0].field1samefield, 'row1value1')
    self.assertEqual(table[0].field2other, 'row1value2')
    self.assertEqual(table[1].field1samefield, 'row2value1')
    self.assertEqual(table[1].field2other, 'row2value2')
# coding: utf-8

from __future__ import division

from collections import OrderedDict

import re

import rows

table = rows.import_from_csv('../0cleaning/clean_data2.csv')
quantidade_de_exemplos = len(table)

statistics = {}
for row in table:
    drow = row.__dict__
    for key, value in drow.items():
        if key not in statistics:
            statistics[key] = {}
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField
table_output = rows.Table(fields=OrderedDict({
    'column': string,
    'value': string,
    'amount': string,
    'percent': string,
}))
for key in statistics.keys():
    for value in statistics[key].keys():
        table_output.append({
            'column': key,
            'value': value,
            'amount': statistics[key][value],
            'percent': "{0:.2f}".format(statistics[key][value] / quantidade_de_exemplos * 100),
        })
def table(self):
    """Returns the table with all data in rows format."""
    file_data = self._file_data()
    data = StringIO.StringIO(file_data)
    table = rows.import_from_csv(data)
    return table
# This example was based on:
# https://github.com/compjour/search-script-scrape/blob/master/scripts/101.py

from io import BytesIO

import requests
import rows

# Capture
url = "http://unitedstates.sunlightfoundation.com/legislators/legislators.csv"
csv = BytesIO(requests.get(url).content)

# Normalize
table = rows.import_from_csv(csv)

# Analyze
total = len(table)
total_in_office = sum(1 for row in table if row.in_office)
men = sum(1 for row in table if row.gender == "M")
men_in_office = sum(1 for row in table if row.gender == "M" and row.in_office)
women = sum(1 for row in table if row.gender == "F")
women_in_office = sum(1 for row in table if row.gender == "F" and row.in_office)

# View
print(
    "  Men: {}/{} ({:02.2f}%), in office: {}/{} ({:02.2f}%)".format(
        men, total, 100 * men / float(total),
        men_in_office, men, 100 * men_in_office / float(men),
    )
)
def update_data_for_year(self, year):
    self.debug(u'Updating data for year {0}'.format(year))

    try:
        csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')
    except Exception:
        print u'Not found data for year {0}'.format(year)
        return

    # Skip first line
    head, tail = csv_data.split('\n', 1)

    self.debug(u'Reading file...')
    data = rows.import_from_csv(BytesIO(tail.encode('utf-8')))
    if not data:
        self.debug(u'Error downloading file for year {0}'.format(year))
        return

    expected_header = [
        u'ano', u'mes', u'senador', u'tipo_despesa', u'cnpj_cpf',
        u'fornecedor', u'documento', u'data', u'detalhamento',
        u'valor_reembolsado',
    ]
    actual_header = data.fields.keys()
    if actual_header != expected_header:
        # FIXME
        print u'Bad CSV: expected header {0}, got {1}'.format(
            expected_header, actual_header
        )
        return

    archived_expense_list = []
    objects_counter = 0
    archived_expense_list_counter = len(data)

    legislators = {}
    mandates = {}
    natures = {}
    for row in data:
        if not row.senador:
            self.debug(u'Error downloading file for year {0}')
            continue

        if not row.data:
            date = '01/{0}/{1}'.format(row.mes, row.ano)
            expense_date = datetime.strptime(date, '%d/%m/%Y')
        else:
            expense_date = datetime.strptime(row.data, '%d/%m/%Y')

        name = self._normalize_name(row.senador)
        nature = row.tipo_despesa
        cpf_cnpj = row.cnpj_cpf
        supplier_name = row.fornecedor
        docnumber = row.documento
        expensed = row.valor_reembolsado

        # FIXME: WTF?
        if isinstance(expensed, unicode):
            expensed = float(
                expensed.replace(',', '.').replace('\r', '').replace('\n', '')
            )

        # memory cache
        expense_nature = natures.get(nature)
        if not expense_nature:
            expense_nature, _ = ExpenseNature.objects.get_or_create(name=nature)
            natures[nature] = expense_nature

        supplier = self.get_or_create_supplier(cpf_cnpj, supplier_name)

        # memory cache
        legislator = legislators.get(name)
        if not legislator:
            legislator = self._get_or_create_legislator(name)
            legislators[name] = legislator

        # memory cache
        mandate = mandates.get(name)
        if not mandate:
            mandate = self.mandate_for_legislator(legislator, None)
            mandates[name] = mandate

        expense = ArchivedExpense(
            number=docnumber,
            nature=expense_nature,
            date=expense_date,
            expensed=expensed,
            mandate=mandate,
            supplier=supplier,
            collection_run=self.collection_run,
        )
        archived_expense_list.append(expense)
        self.debug(u'New expense found: {0}'.format(unicode(expense)))

        objects_counter += 1
        archived_expense_list_counter -= 1

        # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER entries.
        # If that list reaches the maximum object count allowed
        # or if there are no more objects in archived_expense_list,
        # we bulk_create() them and clear the list.
        if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
            ArchivedExpense.objects.bulk_create(archived_expense_list)
            archived_expense_list[:] = []
            objects_counter = 0
            reset_queries()
import rows
from io import BytesIO

print '------\nReading file\n-----'
filename = 'data/PERM_Disclosure_Data_FY16_Q2_shorter.csv'  # shorter csv
f = open(filename, 'r')
filedata = f.read()
f.close()
filedata = filedata.replace('2007_NAICS_US_CODE', 'COLUMN_2007_NAICS_US_CODE')
filedata = filedata.replace('2007_NAICS_US_TITLE', 'COLUMN_2007_NAICS_US_TITLE')
visas_data = rows.import_from_csv(BytesIO(filedata))

print 'Hey, rows automatically identified the types:'
for field_name, field_type in visas_data.fields.items():
    print '{} is {}'.format(field_name, field_type)

print '------\nStart analysis\n-----'
certified = filter(lambda row: row.case_status == 'Certified', visas_data)
denied = filter(lambda row: row.case_status == 'Denied', visas_data)
print 'Certified vs Denied: {} vs {}'.format(len(certified), len(denied))

developer_code = "15-1133"
developers = filter(lambda row: row.pw_soc_code == developer_code, visas_data)
non_developers = filter(lambda row: row.pw_soc_code != developer_code, visas_data)
print 'Devs vs Non-Devs: {} vs {}'.format(len(developers), len(non_developers))

developers_certified = filter(
    lambda row: row.case_status == 'Certified' and row.pw_soc_code == developer_code,
    visas_data)
developers_denied = filter(
    lambda row: row.case_status == 'Denied' and row.pw_soc_code == developer_code,
    visas_data)
def test_real_data_1(self):
    filename = "tests/data/balneabilidade-26-2010"
    result = rows.import_from_pdf(filename + ".pdf", backend=self.backend)
    expected = rows.import_from_csv(filename + ".csv")
    self.assertEqual(list(expected), list(result))