def test_detect_dialect_using_json(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = '{}.{}'.format(temp.name, self.file_extension)
    encoding = 'utf-8'
    self.files_to_delete.append(filename)

    # Using JSON will force the sniffer to not include ':' and '}' in the
    # possible delimiters
    table = rows.Table(fields=OrderedDict([
        ('jsoncolumn1', rows.fields.JSONField),
        ('jsoncolumn2', rows.fields.JSONField),
    ]))
    table.append({
        'jsoncolumn1': '{"a": 42}',
        'jsoncolumn2': '{"b": 43}',
    })
    table.append({
        'jsoncolumn1': '{"c": 44}',
        'jsoncolumn2': '{"d": 45}',
    })
    rows.export_to_csv(table, filename, encoding=encoding)

    table = rows.import_from_csv(filename, encoding=encoding)

    self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
    self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
    self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
    self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
    self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
def parse_state_file(self, response):
    state = response.meta["state"]
    self.errors = []

    try:
        self.parse_boletim(state, response.body)
    except Exception as exp:
        self.errors.append(("boletim", state, f"{exp.__class__.__name__}: {exp}"))
    try:
        self.parse_caso(state, response.body)
    except Exception as exp:
        self.errors.append(("caso", state, f"{exp.__class__.__name__}: {exp}"))

    if self.errors:
        error_counter = Counter(error[0] for error in self.errors)
        error_counter_str = ", ".join(
            f"{error_type}: {count}"
            for error_type, count in error_counter.items()
        )
        self.logger.error(
            f"{len(self.errors)} errors found when parsing {state} ({error_counter_str})"
        )
        error_header = ("sheet", "state", "message")
        errors = rows.import_from_dicts(
            [dict(zip(error_header, row)) for row in self.errors]
        )
        rows.export_to_csv(errors, f"errors-{state}.csv")
        exit(255)
def test_quotes(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = '{}.{}'.format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.Table(fields=OrderedDict([
        ('field_1', rows.fields.TextField),
        ('field_2', rows.fields.TextField),
        ('field_3', rows.fields.TextField),
        ('field_4', rows.fields.TextField),
    ]))
    table.append({
        'field_1': '"quotes"',
        'field_2': 'test "quotes"',
        'field_3': '"quotes" test',
        'field_4': 'test "quotes" test',
    })
    # We need this second row since `"quotes"` on `field_1` could be detected
    # as `JSONField` or `TextField`
    table.append({
        'field_1': 'noquotes',
        'field_2': 'test "quotes"',
        'field_3': '"quotes" test',
        'field_4': 'test "quotes" test',
    })
    rows.export_to_csv(table, filename)

    table2 = rows.import_from_csv(filename)
    self.assert_table_equal(table, table2)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("state", choices=spiders.keys())
    parser.add_argument("--year", type=int)
    parser.add_argument("--action", type=int)
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--headless", action="store_true")
    args = parser.parse_args()

    if args.year is None and args.action is None:
        actions = get_actions_for_state(args.state)
    elif args.action is None:
        actions = [
            action
            for action in get_actions_for_state(args.state)
            if action.year == args.year
        ]
    else:
        actions = [
            Action(year=args.year, code=args.action, state=args.state, name="Unknown")
        ]

    spider = spiders[args.state](headless=args.headless)
    for action in actions:
        if not args.quiet:
            print(
                f"Downloading budget execution for {action.state} ({action.code} @ {action.year})"
            )
        table = spider.execute(action.year, action.code)
        output_filename = f"{action.state}-{action.year}-{action.code}.csv"
        rows.export_to_csv(table, output_filename)
        if not args.quiet:
            print(f" done (saved to {output_filename})")
    spider.close()
def parse_state_file(self, response):
    state = response.meta["state"]
    self.errors = []

    try:
        self.parse_boletim(state, response.body)
    except Exception as exp:
        self.errors.append(("boletim", state, f"{exp.__class__.__name__}: {exp}"))
    try:
        self.parse_caso(state, response.body)
    except Exception as exp:
        self.errors.append(("caso", state, f"{exp.__class__.__name__}: {exp}"))

    if self.errors:
        error_counter = Counter(error[0] for error in self.errors)
        error_counter_str = ", ".join(
            f"{error_type}: {count}"
            for error_type, count in error_counter.items()
        )
        self.logger.error(
            f"{len(self.errors)} errors found when parsing {state} ({error_counter_str})"
        )
        error_header = ("sheet", "state", "message")
        errors = rows.import_from_dicts(
            [dict(zip(error_header, row)) for row in self.errors]
        )
        filename = ERROR_PATH / f"errors-{state}.csv"
        if not filename.parent.exists():
            filename.parent.mkdir(parents=True)
        rows.export_to_csv(errors, filename)

        # Force crawler to stop
        os.kill(os.getpid(), SIGINT)
        os.kill(os.getpid(), SIGINT)
        raise CloseSpider(f"Error found on {state} (see {filename}).")
def create_download_script(filename=Path('data/download.sh'),
                           output_path=Path('data/output'),
                           download_path=Path('data/download')):
    if not filename.parent.exists():
        filename.parent.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    links = discover_links()
    today = datetime.datetime.now()
    date = f'{today.year}-{today.month:02d}-{today.day:02d}'
    rows.export_to_csv(links, output_path / f'links_{date}.csv')

    with open(filename, mode='w', encoding='utf8') as fobj:
        fobj.write('#!/bin/sh\n')
        fobj.write(f'# Arquivo gerado em {today.year}-{today.month}-{today.day}\n')
        fobj.write(
            '# Visite o site da Receita Federal para verificar se existem atualizações.\n\n'
        )
        fobj.write('mkdir -p {}\n\n'.format(str(download_path)))
        for row in links:
            path = download_path / (row.uf + '.txt')
            fobj.write(f'wget -O "{path}" "{row.url}"\n')

    meta = os.stat(filename)
    os.chmod(filename, meta.st_mode | stat.S_IEXEC)
def download_and_save(names, filename):
    result = []
    for name in names:
        print(name)
        result.append(classify_by_sex(name))
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, filename)
def parse_state_file(self, response):
    meta = response.meta
    state = meta["state"]
    caso_filename = meta["caso_filename"]

    if response.status >= 400:
        self.errors[state].append(
            ("connection", state, f"HTTP status code: {response.status}")
        )
    else:
        response_data = json.load(io.BytesIO(response.body))
        try:
            self.parse_boletim(state, response_data["reports"])
        except Exception as exp:
            self.errors[state].append(
                ("boletim", state, f"{exp.__class__.__name__}: {exp}")
            )
        try:
            self.parse_caso(state, caso_filename, response_data["cases"])
        except Exception as exp:
            self.errors[state].append(
                ("caso", state, f"{exp.__class__.__name__}: {exp}")
            )

    if self.errors[state]:
        error_counter = Counter(error[0] for error in self.errors[state])
        error_counter_str = ", ".join(
            f"{error_type}: {count}"
            for error_type, count in error_counter.items()
        )
        self.logger.error(
            f"{len(self.errors[state])} errors found when parsing {state} ({error_counter_str})"
        )
        error_header = ("sheet", "state", "message")
        errors = rows.import_from_dicts(
            [dict(zip(error_header, row)) for row in self.errors[state]]
        )
        filename = ERROR_PATH / f"errors-{state}.csv"
        if not filename.parent.exists():
            filename.parent.mkdir(parents=True)
        rows.export_to_csv(errors, filename)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('html_entrada')
    parser.add_argument('csv_saida')
    args = parser.parse_args()

    table = sum_iof_into_entries(html_to_table(args.html_entrada))
    rows.export_to_csv(table, args.csv_saida)
def test_export_to_csv_filename(self):
    # TODO: may test file contents
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    rows.export_to_csv(utils.table, temp.name)

    table = rows.import_from_csv(temp.name)
    self.assert_table_equal(table, utils.table)
def create_final_headers(header_type, order_columns, final_filename):
    final_headers = {}
    filenames = sorted(
        [
            (REGEXP_HEADER_YEAR.findall(filename)[0], filename)
            for filename in glob(str(settings.HEADERS_PATH / f"{header_type}-*.csv"))
            if REGEXP_HEADER_YEAR.findall(filename)
        ]
    )
    # TODO: check if schema is according to final header. If there are diffs,
    # warn the user.
    for index, (header_year, filename) in enumerate(filenames):
        header = read_header(filename)
        for row in header:
            if not row.nome_final:
                continue
            if row.nome_final not in final_headers:
                row_data = row._asdict()
                if index > 0:
                    row_data["introduced_on"] = header_year
                row_data["original_names"] = [(header_year, row_data.pop("nome_tse"))]
                final_headers[row.nome_final] = row_data
            else:
                original_name = (header_year, row.nome_tse)
                original_names = final_headers[row.nome_final]["original_names"]
                should_add = True
                for original in original_names:
                    if original[1] == original_name[1]:
                        should_add = False
                        break
                if should_add:
                    original_names.append(original_name)

    table = rows.Table(
        fields=OrderedDict(
            [
                ("nome_final", rows.fields.TextField),
                ("descricao", rows.fields.TextField),
            ]
        )
    )
    header_list = sorted(
        final_headers.values(), key=lambda row: order_columns(row["nome_final"])
    )
    for row in header_list:
        row_data = {
            "descricao": row["descricao"] or "",
            "nome_final": row["nome_final"],
        }
        introduced_on = row.get("introduced_on", None)
        original_names = ", ".join(
            f"{item[1]} ({item[0]})" for item in row.get("original_names")
        )
        row_data["descricao"] += f". Aparece no TSE como: {original_names}"
        if introduced_on:
            row_data["descricao"] += f". Coluna adicionada em {introduced_on}"
        if row_data["descricao"][-1] != ".":
            row_data["descricao"] += "."
        table.append(row_data)
    rows.export_to_csv(table, final_filename)
def test_export_callback(self):
    table = rows.import_from_dicts([{'id': number} for number in range(10)])
    myfunc = mock.Mock()
    rows.export_to_csv(table, callback=myfunc, batch_size=3)
    self.assertEqual(myfunc.call_count, 4)
    self.assertEqual([x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10])
def write_csv(self):
    filename = OUTPUT_PATH / Path("caso-sc.csv")

    # Add the state-level row for SC
    data = {}
    data["date"] = date.today()
    data["state"] = "SC"
    data["city"] = ""
    data["place_type"] = "state"
    data["notified"] = sum(
        [self.data_cities[i]["notified"] for i in range(len(self.data_cities))]
    )
    data["confirmed"] = sum(
        [self.data_cities[i]["confirmed"] for i in range(len(self.data_cities))]
    )
    data["discarded"] = sum(
        [self.data_cities[i]["discarded"] for i in range(len(self.data_cities))]
    )
    data["suspect"] = sum(
        [self.data_cities[i]["suspect"] for i in range(len(self.data_cities))]
    )
    data["deaths"] = sum(
        [self.data_cities[i]["deaths"] for i in range(len(self.data_cities))]
    )
    data["city_ibge_code"] = ""
    data["estimated_population_2019"] = sum(
        [self.data_cities[i]["estimated_population_2019"] for i in range(len(self.data_cities))]
    )
    data["confirmed_per_100k_inhabitants"] = sum(
        [self.data_cities[i]["confirmed_per_100k_inhabitants"] for i in range(len(self.data_cities))]
    )
    data["death_rate"] = sum(
        [self.data_cities[i]["death_rate"] for i in range(len(self.data_cities))]
    ) / len(self.data_cities)
    data["notes"] = ""
    data["source_url"] = ""
    self.data_cities.append(data)

    # Add the cities that have not been parsed yet
    for each_city in self.cities_sc_ibge:
        if each_city.municipio not in self.cidades_url.keys():
            data = {}
            data["date"] = ""
            data["state"] = "SC"
            data["city"] = each_city.municipio
            data["place_type"] = "city"
            data["notified"] = ""
            data["confirmed"] = ""
            data["discarded"] = ""
            data["suspect"] = ""
            data["deaths"] = ""
            data["notes"] = ""
            data["city_ibge_code"] = ""
            data["estimated_population_2019"] = ""
            data["confirmed_per_100k_inhabitants"] = ""
            data["death_rate"] = ""
            data["source_url"] = ""
            self.data_cities.append(data)

    rows_data = rows.import_from_dicts(self.data_cities)
    rows_data.order_by("city")
    rows.export_to_csv(rows_data, filename)
def test_import_field_limit(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.import_from_dicts([{"f1": "a" * 132000}])
    rows.export_to_csv(table, filename)

    # The following line must not raise the exception:
    # `_csv.Error: field larger than field limit (131072)`
    new = rows.import_from_csv(filename)
def test_issue_168(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
    table.append({"jsoncolumn": '{"python": 42}'})
    rows.export_to_csv(table, filename)

    table2 = rows.import_from_csv(filename)
    self.assert_table_equal(table, table2)
def export_prob_predictions_to_csv(self, filename, ids, predictions):
    new_rows = []
    # Label mapping used by the classifier:
    # {'Return_to_owner': 0, 'Euthanasia': 1, 'Adoption': 2, 'Transfer': 3, 'Died': 4}
    for i, prediction in enumerate(predictions):
        # Output columns: ID, Adoption, Died, Euthanasia, Return_to_owner, Transfer
        new_row = OrderedDict({})
        new_row['ID'] = ids[i]
        new_row['Adoption'] = prediction[2]
        new_row['Died'] = prediction[4]
        new_row['Euthanasia'] = prediction[1]
        new_row['Return_to_owner'] = prediction[0]
        new_row['Transfer'] = prediction[3]
        new_rows.append(new_row)

    new_rows.sort(key=lambda e: e['ID'])

    new_fields = [(key, rows.fields.UnicodeField) for key in new_rows[0].keys()]
    table_to = rows.Table(fields=OrderedDict(new_fields))
    for row in new_rows:
        table_to.append(row)
    rows.export_to_csv(table_to, filename)
def test_export_to_csv_uses_serialize(self, mocked_serialize):
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    kwargs = {"test": 123, "parameter": 3.14}
    mocked_serialize.return_value = iter([utils.table.fields.keys()])

    rows.export_to_csv(utils.table, temp.name, encoding="utf-8", **kwargs)
    self.assertTrue(mocked_serialize.called)
    self.assertEqual(mocked_serialize.call_count, 1)

    call = mocked_serialize.call_args
    self.assertEqual(call[0], (utils.table,))
    self.assertEqual(call[1], kwargs)
def test_export_to_csv_filename(self):
    # TODO: may test file contents
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    rows.export_to_csv(utils.table, temp.name)

    table = rows.import_from_csv(temp.name)
    self.assert_table_equal(table, utils.table)

    temp.file.seek(0)
    result = temp.file.read()
    export_in_memory = rows.export_to_csv(utils.table, None)
    self.assertEqual(result, export_in_memory)
def test_export_to_csv_uses_serialize(self, mocked_serialize):
    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    kwargs = {'test': 123, 'parameter': 3.14}
    mocked_serialize.return_value = iter([utils.table.fields.keys()])

    rows.export_to_csv(utils.table, temp.name, encoding='utf-8', **kwargs)
    self.assertTrue(mocked_serialize.called)
    self.assertEqual(mocked_serialize.call_count, 1)

    call = mocked_serialize.call_args
    self.assertEqual(call[0], (utils.table, ))
    self.assertEqual(call[1], kwargs)
def main():
    now = datetime.datetime.now()
    today = datetime.date(now.year, now.month, now.day)
    download_path = pathlib.Path('download')
    output_path = pathlib.Path('output')
    if not download_path.exists():
        download_path.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    # Get spreadsheet links
    links = get_links(date=today)
    rows.export_to_csv(links, output_path / f'links-{today}.csv')

    # Download all the links
    result = []
    for link in links:
        print(link.name)
        filename = download_path / urlparse(link.url).path.split('/')[-1]

        # Download file
        print(f' Downloading ({link.url})...', end='', flush=True)
        if filename.exists():
            print(f' already downloaded.')
        else:
            try:
                download(link.url, filename)
            except RuntimeError as exception:
                print(f' {exception.args[0]}')
                continue
            else:
                print(' done.')

        # Extract data
        print(f' Extracting ({filename})...', end='', flush=True)
        try:
            data = extract(filename, link)
        except Exception as exp:
            import traceback
            print(f' ERROR! {traceback.format_exc().splitlines()[-1]}')
        else:
            print(f' done (rows extracted: {len(data)}).')
            result.extend(data)

    # Extract everything to a new CSV
    output = output_path / f'salarios-magistrados-{today}.csv'
    print(f'Extracting result to {output}...')
    export_csv(result, output)
def pdf_to_csv(input_filename, output_filename):
    total_pages = rows.plugins.pdf.number_of_pages(input_filename)
    pdf = rows.plugins.pdf.PyMuPDFBackend(input_filename)

    result = []
    for page_number in range(1, total_pages + 1):
        page = list(next(pdf.objects(page_numbers=(page_number,))))
        data = list(rows.plugins.utils.ipartition(page, 4))
        header = [obj.text for obj in data[0]]
        for row in data[1:]:
            row = dict(zip(header, [obj.text for obj in row]))
            row["codigo_ibge"] = row.pop("IBGE")
            row["perfil"] = row.pop("Perfil Município")
            result.append(row)

    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, output_filename)
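# Note on the snippet above: `rows.plugins.utils.ipartition(page, 4)` only slices
# the flat list of extracted text objects into chunks of four, so the conversion
# assumes the PDF table has exactly four columns and repeats its header on every
# page. For a table with a different layout, the chunk size would have to change
# or the higher-level `rows.import_from_pdf` table detection could be used instead.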
def test_import_from_xpath_filename(self):
    table = rows.import_from_xpath(self.filename, encoding=self.encoding, **self.kwargs)

    expected_meta = {'imported_from': 'xpath', 'filename': self.filename}
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)

    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def __call__(self):
    view = getMultiAdapter((self.context, self.request), name='view')
    table = view.table()
    filename = "%s.csv" % view.filename_prefix()
    data = rows.export_to_csv(table)
    self.request.response.setHeader(
        'Content-Type', '"%s"' % EXTENSIONS_TYPES.get('csv'))
    self.request.response.setHeader(
        'Content-Disposition', 'attachment; filename="%s"' % filename)
    return data
def convert(state, input_filename, output_filename):
    table = rows.import_from_csv(
        input_filename,
        force_types={
            "confirmed": rows.fields.IntegerField,
            "deaths": rows.fields.IntegerField,
        },
    )
    state_cities = ["TOTAL NO ESTADO", "Importados/Indefinidos"] + sorted(
        row.municipio for row in cities if row.uf == state
    )

    confirmed, deaths, dates = {}, {}, []
    for row in table:
        row_confirmed = row.confirmed or 0
        row_date = row.date
        row_deaths = row.deaths or 0
        row_name = row.city if row.place_type == "city" else "TOTAL NO ESTADO"
        if row_name not in state_cities:
            print(f"ERRO: município {repr(row_name)} não encontrado.")
            continue
        if row_confirmed == 0 and row_deaths == 0:
            # No data for this city on this day
            continue
        if row_date not in confirmed:
            confirmed[row_date] = {}
        if row_date not in deaths:
            deaths[row_date] = {}
        if row_name in confirmed[row_date] or row_name in deaths[row_date]:
            print(f"ERRO: conflito em {repr(row_name)} para {row_date}.")
            continue
        confirmed[row_date][row_name] = row_confirmed
        deaths[row_date][row_name] = row_deaths

    result = []
    dates = sorted(confirmed.keys(), reverse=True)
    for city in state_cities:
        row = {"municipio": city}
        for date in dates:
            date_str = f"{date.day:02d}_{date.month:02d}"
            row[f"confirmados_{date_str}"] = confirmed[date].get(city, None)
            row[f"mortes_{date_str}"] = deaths[date].get(city, None)
        result.append(row)
    rows.export_to_csv(rows.import_from_dicts(result), output_filename)
def main():
    now = datetime.datetime.now()
    today = datetime.date(now.year, now.month, now.day)
    download_path = pathlib.Path('download')
    output_path = pathlib.Path('output')
    if not download_path.exists():
        download_path.mkdir()
    if not output_path.exists():
        output_path.mkdir()

    # Get spreadsheet links
    links = get_links(date=today)
    rows.export_to_csv(links, output_path / f'links-{today}.csv')

    # Download all the links
    filenames = []
    for link in links:
        save_path = download_path / urlparse(link.url).path.split('/')[-1]
        filenames.append(save_path)
        if not save_path.exists():
            print(f'Downloading {link.url}...', end='', flush=True)
            download(link.url, save_path)
            print(' done.')
        else:
            print(f'Skipping {save_path.name}...')

    # Extract data from all the spreadsheets
    result = []
    for filename in filenames:
        print(f'Extracting {filename.name}...', end='', flush=True)
        try:
            data = extract(filename)
        except Exception as exp:
            import traceback
            print(f' ERROR! {traceback.format_exc().splitlines()[-1]}')
        else:
            print(' done.')
            result.extend(data)

    # Extract everything to a new CSV
    output = output_path / f'salarios-magistrados-{today}.csv'
    print(f'Extracting result to {output}...')
    export_csv(result, output)
def test_import_from_xpath_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    with open(self.filename, mode='rb') as fobj:
        table = rows.import_from_xpath(fobj, encoding=self.encoding, **self.kwargs)

    expected_meta = {
        'imported_from': 'xpath',
        'filename': self.filename,
        'encoding': self.encoding,
    }
    self.assertEqual(table.meta, expected_meta)

    temp = tempfile.NamedTemporaryFile(delete=False)
    self.files_to_delete.append(temp.name)
    fobj = temp.file
    rows.export_to_csv(table, fobj)
    fobj.seek(0)

    table = rows.import_from_csv(fobj)
    self.assert_table_equal(table, self.expected_table)
def download_game_data_for_country(path, year, country_code):
    'Download country athlete data for a specific year if not downloaded yet'

    filename = path.joinpath(_make_filename(year, country_code))
    if filename.exists():
        print(' (already downloaded, skipping)')
        return

    url = URL_DATA.format(year=year, country_code=country_code)
    response = requests.get(url)
    if '404' in response.url:  # country didn't play this year
        print(" (didn't play this year, skipping)")
        return

    html = response.content
    table = rows.import_from_html(BytesIO(html), encoding='utf-8', fields=FIELDS)
    rows.export_to_csv(table, str(filename.absolute()), encoding='utf-8')
    print(' ok')
def export_exact_predictions_to_csv(self, filename, ids, predictions):
    new_rows = []
    # Label mapping used by the classifier:
    # {'Return_to_owner': 0, 'Euthanasia': 1, 'Adoption': 2, 'Transfer': 3, 'Died': 4}
    for i, prediction in enumerate(predictions):
        # Output columns: ID, Adoption, Died, Euthanasia, Return_to_owner, Transfer
        new_row = OrderedDict({})
        new_row['ID'] = ids[i]
        new_row['Adoption'] = int(prediction == '2')
        new_row['Died'] = int(prediction == '4')
        new_row['Euthanasia'] = int(prediction == '1')
        new_row['Return_to_owner'] = int(prediction == '0')
        new_row['Transfer'] = int(prediction == '3')
        new_rows.append(new_row)

    new_rows.sort(key=lambda e: e['ID'])

    new_fields = [(key, rows.fields.UnicodeField) for key in new_rows[0].keys()]
    table_to = rows.Table(fields=OrderedDict(new_fields))
    for row in new_rows:
        table_to.append(row)
    rows.export_to_csv(table_to, filename)
def parse_licitacao(year, city_code):
    # TODO: add the IBGE code, municipality name and year to the 3 files
    filename = DOWNLOAD_PATH / f"{year}_{city_code}_Licitacao.zip"

    result1 = parse(filename, "licitacao")
    rows.export_to_csv(result1, OUTPUT_PATH / f"licitacao-{city_code}-{year}.csv")

    result2 = parse(filename, "licitacao_participante")
    rows.export_to_csv(
        result2, OUTPUT_PATH / f"licitacao-participante-{city_code}-{year}.csv"
    )

    result3 = parse(filename, "licitacao_vencedor")
    rows.export_to_csv(
        result3, OUTPUT_PATH / f"licitacao-vencedor-{city_code}-{year}.csv"
    )
    if row.animaltype == 'Dog':
        new_row.update(get_dog_age_columns(row))
        # Too slow, so breed and color are skipped here
        # new_row.update(get_dog_breed_columns(row))
        # new_row.update(get_dog_color_columns(row))
        new_row['outcome'] = get_animal_outcome(row)
        new_dog_rows.append(new_row)

new_fields = [(key, rows.fields.UnicodeField) for key in new_cat_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_cat_rows:
    table_to.append(row)
rows.export_to_csv(table_to, "clean_data3_no_color_no_breed_cat.csv")

new_fields = [(key, rows.fields.UnicodeField) for key in new_dog_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_dog_rows:
    table_to.append(row)
rows.export_to_csv(table_to, "clean_data3_no_color_no_breed_dog.csv")

##################
# Cleaning the test data
##################
table_from = rows.import_from_csv("../test.csv")
import io

import requests
import rows

url = "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/geraBoletim?idcampanha=42041"
print("*** Downloading PDF...")
response = requests.get(url)

# The line below will automatically identify the table in all PDF pages - it
# works for this file but not for all cases. You can be more specific, defining
# the page numbers, a start/end string (like the header/footer strings) and
# also changing the table identification algorithm. Check the `backend`,
# `algorithm`, `starts_after`, `ends_before` and `page_numbers` parameters.
# For this simple case you could also install rows' CLI (`pip install
# rows[cli]`) and run: `rows print <url>`
table = rows.import_from_pdf(io.BytesIO(response.content))
rows.export_to_csv(table, "beach-data.csv")
print("*** Table exported to beach-data.csv")

print("*** Extracted table:")
print(rows.export_to_txt(table))
# You could also iterate over the object, like:
# for row in table: print(row)

print("\n\n*** Extracted text:")
text_pages = rows.plugins.pdf.pdf_to_text(io.BytesIO(response.content))
print("\n\n".join(text_pages))
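# A more constrained variant of the call above, shown as a sketch: the keyword
# arguments are the ones named in the comment in the previous snippet, but the
# page number and the start/end strings below are illustrative placeholders,
# not values taken from this PDF.
import io

import requests
import rows

response = requests.get(
    "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/"
    "geraBoletim?idcampanha=42041"
)
table = rows.import_from_pdf(
    io.BytesIO(response.content),
    page_numbers=(1,),        # parse only the first page
    starts_after="Ponto",     # hypothetical string that precedes the table
    ends_before="Legenda",    # hypothetical string that follows the table
)
rows.export_to_csv(table, "beach-data-page1.csv")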
# Get data from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath='//table/tr/td/ul/li',
    fields_xpath=OrderedDict([('name', './/text()'), ('link', './/a/@href')]))

regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'
    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    data['name'], data['state'] = regexp_city_state.findall(data['name'])[0]
    return data


new_fields = OrderedDict()
new_fields['name'] = cities.fields['name']
new_fields['state'] = rows.fields.TextField  # new field
new_fields['link'] = cities.fields['link']
cities = rows.transform(new_fields, transform, cities)

rows.export_to_csv(cities, 'brazilian-cities.csv')
table_2 = rows.Table(fields=new_fields)
for row in table_1:
    if row.sexuponoutcome and row.sexuponoutcome != u'Unknown':
        castration, sex = row.sexuponoutcome.split()
    else:
        castration, sex = u'Unknown', u'Unknown'

    week_day = calendar.day_name[row.datetime.weekday()]
    us_holidays = holidays.UnitedStates()
    holiday = row.datetime in us_holidays
    age_in_days = get_animal_age(row.ageuponoutcome)
    if row.animaltype == "Cat":
        age_group = get_cat_age_group(age_in_days)
    else:
        age_group = get_dog_age_group(age_in_days)

    table_2.append({
        'animalid': row.animalid,
        'name': row.name,
        'datetime': week_day,
        'holiday': holiday,
        'outcometype': row.outcometype,
        'outcomesubtype': row.outcomesubtype,
        'animaltype': row.animaltype,
        'sex': sex,
        'castration': castration,
        'agegroup': age_group,
        'breed': row.breed,
        'color': row.color,
    })

rows.export_to_csv(table_2, "clean_data.csv")
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# Taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(
    os.path.dirname(__file__), "../../tests/data/ecuador-medios-radiodifusoras.html"
)
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict(
    [
        ("url", ".//h2/a/@href"),
        ("name", ".//h2/a/text()"),
        ("address", './/div[@class="spField field_direccion"]/text()'),
        ("phone", './/div[@class="spField field_telefono"]/text()'),
        ("website", './/div[@class="spField field_sitio_web"]/text()'),
        ("email", './/div[@class="spField field_email"]/text()'),
    ]
)
table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, "ecuador-radiodifusoras.csv")
# coding: utf-8

from __future__ import unicode_literals

import os
from collections import OrderedDict

import rows

# Taken from:
# http://www.supercom.gob.ec/es/informate-y-participa/directorio-de-medios/21-radiodifusoras
filename = os.path.join(
    os.path.dirname(__file__),
    '../../tests/data/ecuador-medios-radiodifusoras.html')
rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
fields_xpath = OrderedDict([
    ('url', './/h2/a/@href'),
    ('name', './/h2/a/text()'),
    ('address', './/div[@class="spField field_direccion"]/text()'),
    ('phone', './/div[@class="spField field_telefono"]/text()'),
    ('website', './/div[@class="spField field_sitio_web"]/text()'),
    ('email', './/div[@class="spField field_email"]/text()'),
])
table = rows.import_from_xpath(filename, rows_xpath, fields_xpath)
rows.export_to_csv(table, 'ecuador-radiodifusoras.csv')
    drow = row.__dict__
    for key, value in drow.items():
        if key not in statistics:
            statistics[key] = {}
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField

columns = {}
columns['value'] = string
for key in statistics.keys():
    for value in statistics[key].keys():
        columns[key + '_' + value] = string

table_2d_analize = rows.Table(fields=columns)
drows = map(lambda r: r.__dict__, table)
for key in statistics.keys():
    for value in statistics[key].keys():
        data = {}
        data['value'] = key + '_' + value
        for key2 in statistics.keys():
            for value2 in statistics[key2].keys():
                data[key2 + '_' + value2] = len(
                    filter(lambda d: d[key] == value and d[key2] == value2, drows))
        table_2d_analize.append(data)

rows.export_to_csv(table_2d_analize, '2d_column_analize.csv')
def test_export_to_csv_accepts_dialect(self):
    result_1 = rows.export_to_csv(utils.table, dialect=csv.excel_tab)
    result_2 = rows.export_to_csv(utils.table, dialect=csv.excel)
    self.assertEqual(result_1.replace(b'\t', b','), result_2)
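# Note on the test above: the `dialect` argument is forwarded to the underlying
# `csv` writer, so a `csv.Dialect` such as `csv.excel_tab` can be passed directly.
# `excel_tab` differs from the default `excel` dialect only in its delimiter,
# which is why replacing tabs with commas reproduces the default output exactly.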
    if row.animaltype == 'Dog':
        new_row.update(get_dog_age_columns(row))
        # These two calls are slow
        new_row.update(get_dog_breed_columns(row))
        new_row.update(get_dog_color_columns(row))
        new_row['outcome'] = get_animal_outcome(row)
        new_dog_rows.append(new_row)

new_fields = [(key, rows.fields.UnicodeField) for key in new_cat_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_cat_rows:
    table_to.append(row)
rows.export_to_csv(table_to, "clean_data3_cat.csv")

new_fields = [(key, rows.fields.UnicodeField) for key in new_dog_rows[0].keys()]
table_to = rows.Table(fields=OrderedDict(new_fields))
for row in new_dog_rows:
    table_to.append(row)
rows.export_to_csv(table_to, "clean_data3_dog.csv")

##################
# Cleaning the test data
##################
table_from = rows.import_from_csv("../test.csv")
# Get data from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Extract desired data using XPath
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)

regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    'Transform row "link" into full URL and add "state" based on "name"'
    data = row._asdict()
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    data["name"], data["state"] = regexp_city_state.findall(data["name"])[0]
    return data


new_fields = OrderedDict()
new_fields["name"] = cities.fields["name"]
new_fields["state"] = rows.fields.TextField  # new field
new_fields["link"] = cities.fields["link"]
cities = rows.transform(new_fields, transform, cities)

rows.export_to_csv(cities, "brazilian-cities.csv")
def csv_claims_by_state():
    counter = count_claims_by_state()
    result = import_from_dicts(counter)
    result.order_by('label')
    return rows.export_to_csv(result)
def test_export_callback(self):
    table = rows.import_from_dicts([{"id": number} for number in range(10)])
    myfunc = mock.Mock()
    rows.export_to_csv(table, callback=myfunc, batch_size=3)
    self.assertEqual(myfunc.call_count, 4)
    self.assertEqual([x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10])
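# Note on the test above (behaviour inferred from its assertions): `callback` is
# called with the cumulative number of rows exported so far, once after every
# `batch_size` rows and once for the final partial batch, hence 4 calls with the
# arguments 3, 6, 9 and 10 for a 10-row table.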
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField

table_output = rows.Table(fields=OrderedDict({
    'column': string,
    'value': string,
    'amount': string,
    'percent': string,
}))
for key in statistics.keys():
    for value in statistics[key].keys():
        table_output.append({
            'column': key,
            'value': value,
            'amount': statistics[key][value],
            'percent': "{0:.2f}".format(
                statistics[key][value] / quantidade_de_exemplos * 100),
        })

rows.export_to_csv(table_output, '1d_column_analize.csv')
from io import BytesIO

import requests
import rows
import six

extract_links = rows.plugins.html.extract_links
extract_text = rows.plugins.html.extract_text

# Get the HTML
url = "http://wnpp.debian.net/"
response = requests.get(url)
html = response.content

# Import data, preserving each cell's HTML
packages = rows.import_from_html(BytesIO(html), index=10, preserve_html=True)


def transform(row, table):
    'Extract links from "project" field and remove HTML from all'
    data = row._asdict()
    data["links"] = " ".join(extract_links(row.project))
    for key, value in data.items():
        if isinstance(value, six.text_type):
            data[key] = extract_text(value)
    return data


new_fields = packages.fields.copy()
new_fields["links"] = rows.fields.TextField
packages = rows.transform(new_fields, transform, packages)
rows.export_to_csv(packages, "debian-wnpp.csv")
from io import BytesIO

import requests
import rows
import six

extract_links = rows.plugins.html.extract_links
extract_text = rows.plugins.html.extract_text

# Get the HTML
url = 'http://wnpp.debian.net/'
response = requests.get(url)
html = response.content

# Import data, preserving each cell's HTML
packages = rows.import_from_html(BytesIO(html), index=10, preserve_html=True)


def transform(row, table):
    'Extract links from "project" field and remove HTML from all'
    data = row._asdict()
    data['links'] = ' '.join(extract_links(row.project))
    for key, value in data.items():
        if isinstance(value, six.text_type):
            data[key] = extract_text(value)
    return data


new_fields = packages.fields.copy()
new_fields['links'] = rows.fields.TextField
packages = rows.transform(new_fields, transform, packages)
rows.export_to_csv(packages, 'debian-wnpp.csv')
def csv_claims_by_tag():
    counter = count_claims_by_tag()
    result = import_from_dicts(counter)
    result.order_by('count')
    return rows.export_to_csv(result)
    ('animaltype', rows.fields.UnicodeField),
    ('sex', rows.fields.UnicodeField),
    ('castration', rows.fields.UnicodeField),
    # ('ageuponoutcome', rows.fields.UnicodeField),
    # ('breed', rows.fields.UnicodeField),
    # ('color', rows.fields.UnicodeField)
])
table_3 = rows.Table(fields=table3_fields)

for row in table_2:
    if len(row.name):
        has_name = 'Yes'
    else:
        has_name = 'No'

    if (row.datetime == "Sunday" or row.datetime == "Saturday"
            or row.holiday == 'True'):
        free_day = True
    else:
        free_day = False

    table_3.append({
        'has_name': has_name,
        'free_day': free_day,
        'outcometype': row.outcometype,
        'animaltype': row.animaltype,
        'sex': row.sex,
        'castration': row.castration,
    })

rows.export_to_csv(table_3, "clean_data2.csv")