Example #1
 def test_join_feature(self):
     tables = [rows.import_from_csv('tests/data/to-merge-1.csv'),
               rows.import_from_csv('tests/data/to-merge-2.csv'),
               rows.import_from_csv('tests/data/to-merge-3.csv'),]
     merged = rows.join(keys=('id', 'username'), tables=tables)
     expected = rows.import_from_csv('tests/data/merged.csv')
     self.assert_table_equal(merged, expected)
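
A minimal sketch of what `rows.join` does above, with inline data standing in for the `to-merge-*.csv` fixtures (not shown in this listing; the field names here are hypothetical): rows are matched across tables on the key fields and their remaining columns are merged.

import rows

# hypothetical stand-ins for the CSV fixtures
table_a = rows.import_from_dicts([{"id": 1, "username": "alice", "age": 30}])
table_b = rows.import_from_dicts([{"id": 1, "username": "alice", "city": "Recife"}])
merged = rows.join(keys=("id", "username"), tables=[table_a, table_b])
print(merged[0].age, merged[0].city)  # 30 Recife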
Example #2
    def test_real_data_3(self):
        filename = "tests/data/eleicoes-tcesp-161-162.pdf"
        expected1 = "tests/data/expected-eleicoes-tcesp-161-{}.csv".format(self.backend)
        expected2 = "tests/data/expected-eleicoes-tcesp-162-{}.csv".format(self.backend)
        begin = re.compile("Documento gerado em.*")
        end = re.compile("Página: [0-9]+ de.*")

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(1,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected1)
        self.assertEqual(list(expected), list(result))

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(2,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected2)
        self.assertEqual(list(expected), list(result))
Example #3
    def test_real_data_3(self):
        filename = "tests/data/eleicoes-tcesp-161-162.pdf"
        expected1 = "tests/data/expected-eleicoes-tcesp-161-{}.csv".format(
            self.backend)
        expected2 = "tests/data/expected-eleicoes-tcesp-162-{}.csv".format(
            self.backend)
        begin = re.compile("Documento gerado em.*")
        end = re.compile("Página: [0-9]+ de.*")

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(1, ),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected1)
        self.assertEqual(list(expected), list(result))

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(2, ),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected2)
        self.assertEqual(list(expected), list(result))
Example #4
 def test_join_feature(self):
     tables = [
         rows.import_from_csv('tests/data/to-merge-1.csv'),
         rows.import_from_csv('tests/data/to-merge-2.csv'),
         rows.import_from_csv('tests/data/to-merge-3.csv'),
     ]
     merged = rows.join(keys=('id', 'username'), tables=tables)
     expected = rows.import_from_csv('tests/data/merged.csv')
     self.assert_table_equal(merged, expected)
Example #5
    def test_import_from_csv_discover_dialect(self, mocked_create_table):
        data, lines = make_csv_data(quote_char="'",
                                    field_delimiter=";",
                                    line_delimiter="\r\n")
        fobj = BytesIO()
        fobj.write(lines.encode('utf-8'))
        fobj.seek(0)

        rows.import_from_csv(fobj)
        call_args = mocked_create_table.call_args_list[0]
        self.assertEqual(data, list(call_args[0][0]))
Example #7
    def test_import_from_csv_retrieve_desired_data(self, mocked_create_table):
        mocked_create_table.return_value = 42

        # import using filename
        table_1 = rows.import_from_csv(self.filename)
        call_args = mocked_create_table.call_args_list[0]
        self.assert_create_table_data(call_args)

        # import using fobj
        with open(self.filename, 'rb') as fobj:
            table_2 = rows.import_from_csv(fobj)
            call_args = mocked_create_table.call_args_list[1]
            self.assert_create_table_data(call_args)
Example #8
    def test_import_from_csv_retrieve_desired_data(self, mocked_create_table):
        mocked_create_table.return_value = 42

        # import using filename
        rows.import_from_csv(self.filename)
        call_args = mocked_create_table.call_args_list[0]
        self.assert_create_table_data(call_args)

        # import using fobj
        with open(self.filename, 'rb') as fobj:
            rows.import_from_csv(fobj)
            call_args = mocked_create_table.call_args_list[1]
            self.assert_create_table_data(call_args)
Example #9
    def test_both_confirmed_cases_and_deaths_columns_must_be_filled(self):
        original_content = self.content

        # missing confirmed cases
        self.content = original_content.replace("Abatiá,9,1", "Abatiá,,1")
        file_rows = rows.import_from_csv(self.file_from_content)
        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)

        # missing deaths
        self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,")
        file_rows = rows.import_from_csv(self.file_from_content)
        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
Example #10
    def test_both_confirmed_cases_and_deaths_columns_must_be_integers(self):
        original_content = self.content

        # confirmed cases as float
        self.content = original_content.replace("Abatiá,9,1", "Abatiá,9.10,1")
        file_rows = rows.import_from_csv(self.file_from_content)
        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)

        # deaths as float
        self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,1.10")
        file_rows = rows.import_from_csv(self.file_from_content)
        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
Example #11
def merge_files(filenames, output):
    'Merge all game files into one CSV file, adding year and country columns'

    if not output.parent.exists():
        output.parent.mkdir()

    countries_by_code = download_countries()
    games = rows.Table(fields=FULL_FIELDS)

    for filename in filenames:
        year, country_code = _parse_filename(filename)
        country = countries_by_code[country_code]
        print('Merging year: {}, country: {}...'.format(year, country.name))
        game = rows.import_from_csv(str(filename.absolute()),
                                    fields=FIELDS,
                                    dialect=csv.excel,
                                    encoding='utf-8')
        for row in game:
            data = row._asdict()
            data['year'] = year
            data['country_code'] = country_code
            data['country_name'] = country.name
            del data['rk']
            games.append(data)
    games.order_by('-year')
    rows.utils.export_to_uri(games, str(output.absolute()))
Example #12
    def test_not_valid_if_sum_of_deaths_does_not_matches_with_total(self):
        self.content = self.content.replace('TOTAL NO ESTADO,102,32',
                                            'TOTAL NO ESTADO,102,50')
        file_rows = rows.import_from_csv(self.file_from_content)

        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
Example #13
def from_idh_csv(filename):
    entries = rows.import_from_csv(filename)
    # list of codes
    codes = [
        e.code.strip()
        for e in sorted(entries, key=lambda x: x.idh, reverse=True)
    ]
    # list of languages
    languages = [
        e.language.strip()
        for e in sorted(entries, key=lambda x: x.idh, reverse=True)
    ]

    codelang = remove_consecutives({'codes': codes, 'languages': languages})

    chain_str = ' - '.join(codes)
    cprint.ok(f"Translation chain: {chain_str}.")
    cprint.ok(f"Input text: {TEXT}\n")

    start_codelang = {'codes': 'pt', 'languages': 'Portuguese'}
    text, result = chain_translate_text(TEXT,
                                        start_codelang,
                                        codelang,
                                        monitoring=False)

    cprint.ok("\n##### RESULTS ######\n")
    cprint.ok(text)
    print()
    cprint.ok(result)
Example #14
 def prepara_data_table(self):
     try:
         data = rows.import_from_csv(self.csv_file)
     except Exception as e:
         print('Arquivo csv inválido, ou não existe')
         raise e
     return data
Example #15
def find_unfinished():
    unfinished = []
    subtitles = rows.import_from_csv(CSV_FILE)
    for s in subtitles:
        if s.field_4 == NAME and s.field_5 != '已完成':
            unfinished.append(s)
    return unfinished
Example #16
    def test_detect_weird_dialect(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        # If the sniffer reads only the first line, it may not detect that the
        # delimiter is '|'
        encoding = "utf-8"
        data = BytesIO(
            textwrap.dedent(
                """
            field1|field2|field3|field4
            1|2|3|4
            5|6|7|8
            9|0|1|2
            """
            )
            .strip()
            .encode(encoding)
        )

        table = rows.import_from_csv(data, encoding=encoding, lazy=False)
        self.assertEqual(table.field_names, ["field1", "field2", "field3", "field4"])

        expected = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 0, 1, 2]]
        for expected_data, row in zip(expected, table):
            row = [row.field1, row.field2, row.field3, row.field4]
            self.assertEqual(expected_data, row)
Example #17
    def form_valid(self, form):
        registros = rows.import_from_csv(form.cleaned_data['arquivo'])
        identificador = uuid4()
        for registro in registros:
            cliente = Cliente.objects.get_or_create(
                nome=registro.purchaser_name)[0]
            comerciante = Comerciante.objects.get_or_create(
                nome=registro.merchant_name)[0]
            endereco_comerciante = comerciante.enderecos.get_or_create(
                endereco=registro.merchant_address)[0]
            produto = Produto.objects.get_or_create(
                descricao=registro.item_description)[0]

            Venda.objects.create(cliente=cliente,
                                 comerciante=comerciante,
                                 endereco_comerciante=endereco_comerciante,
                                 produto=produto,
                                 valor_unitario=registro.item_price,
                                 quantidade=registro.purchase_count,
                                 identificador=identificador)

        total = number_format(Venda.objects.receita_bruta_total(identificador))
        messages.success(
            self.request,
            f'Importação realizada. Receita Bruta Total: {total}.')
        return super().form_valid(form)
Example #18
    def test_skip_sum_validations_if_flagged_in_the_form_data(
            self, mocked_format):
        mocked_format.return_value = (["results",
                                       "list"], ["warnings", "list"])
        self.data.update({
            "skip_sum_cases": True,
            "skip_sum_deaths": True,
        })
        form = StateSpreadsheetForm(self.data, self.file_data, user=self.user)
        assert form.is_valid(), form.errors
        expected = {
            "table": ["results", "list"],
            "errors": [],
            "warnings": ["warnings", "list"],
        }

        spreadsheet = form.save()
        spreadsheet.refresh_from_db()

        assert expected == spreadsheet.data
        assert 1 == mocked_format.call_count
        method_call = mocked_format.call_args_list[0]
        data, import_date, state = method_call[0]
        kwargs = method_call[1]
        assert date.today() == import_date
        assert state == "PR"
        for entry, expected_entry in zip(
                data, rows.import_from_csv(self.file_data["file"])):
            assert entry._asdict() == expected_entry._asdict()
        assert kwargs["skip_sum_cases"] is True
        assert kwargs["skip_sum_deaths"] is True
Example #19
def read_cases(input_filename, order_by=None):
    cases = rows.import_from_csv(input_filename,
                                 force_types=load_schema(
                                     str(SCHEMA_PATH / "caso.csv")))
    if order_by:
        cases.order_by(order_by)
    return cases
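
For context, `force_types` (used above and again in Examples #26 and #32) overrides the library's type detection for the listed columns. A hedged sketch with a hypothetical path and field name:

import rows

cases = rows.import_from_csv(
    "caso.csv",  # hypothetical path
    # keep the IBGE city code textual instead of letting it be detected as an integer
    force_types={"city_ibge_code": rows.fields.TextField},
)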
Example #20
    def test_detect_dialect_using_json(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        encoding = 'utf-8'
        self.files_to_delete.append(filename)

        # Using JSON forces the sniffer not to include ':' and '}' among the
        # possible delimiters
        table = rows.Table(fields=OrderedDict([
            ('jsoncolumn1', rows.fields.JSONField),
            ('jsoncolumn2', rows.fields.JSONField),
            ]))
        table.append({
            'jsoncolumn1': '{"a": 42}',
            'jsoncolumn2': '{"b": 43}',
            })
        table.append({
            'jsoncolumn1': '{"c": 44}',
            'jsoncolumn2': '{"d": 45}',
            })
        rows.export_to_csv(table, filename, encoding=encoding)

        table = rows.import_from_csv(filename, encoding=encoding)

        self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
        self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
        self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
        self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
        self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
Example #21
    def test_confirmed_cases_must_be_equal_or_greater_than_deaths(self):
        original_content = self.content

        self.content = original_content.replace("Abatiá,9,1", "Abatiá,9,20")
        file_rows = rows.import_from_csv(self.file_from_content)
        with pytest.raises(SpreadsheetValidationErrors):
            format_spreadsheet_rows_as_dict(file_rows, self.date, self.uf)
Example #22
    def test_quotes(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([
            ('field_1', rows.fields.TextField),
            ('field_2', rows.fields.TextField),
            ('field_3', rows.fields.TextField),
            ('field_4', rows.fields.TextField),
        ]))
        table.append({
            'field_1': '"quotes"',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
        })
        # we need this second row since `"quotes"` on `field_1` could be
        # detected as either `JSONField` or `TextField`
        table.append({
            'field_1': 'noquotes',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
        })
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
Example #23
    def test_quotes(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([
                    ('field_1', rows.fields.TextField),
                    ('field_2', rows.fields.TextField),
                    ('field_3', rows.fields.TextField),
                    ('field_4', rows.fields.TextField), ]))
        table.append({
            'field_1': '"quotes"',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
            })
        # we need this second row since `"quotes"` on `field_1` could be
        # detected as either `JSONField` or `TextField`
        table.append({
            'field_1': 'noquotes',
            'field_2': 'test "quotes"',
            'field_3': '"quotes" test',
            'field_4': 'test "quotes" test',
            })
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
Example #24
 def start_requests(self):
     links = rows.import_from_csv(settings.OUTPUT_PATH /
                                  "filiacao-links.csv")
     for row in links:
         yield scrapy.Request(url="file://" +
                              str(Path(row.filename).absolute()),
                              meta=row._asdict())
Example #25
 def serialize_cases(buffer):
     for case in import_from_csv(buffer):
         case = case._asdict()
         data = {
             new_label: case.get(old_label)
             for old_label, new_label in CASE_LABELS
         }
         yield data  # assumed: the original excerpt appears truncated here
Example #26
 def parse(self, response):
     url_table = rows.import_from_csv(
         io.BytesIO(response.body), encoding="utf-8", force_types=HASH_FIELDS,
     )
     self.URLInfo = url_table.Row
     self.url_hashes = {row.url: row for row in url_table}
     yield scrapy.Request(URL_LIST_URL, callback=self.parse_url_list)
Example #27
    def test_detect_dialect_using_json(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        encoding = 'utf-8'
        self.files_to_delete.append(filename)

        # Using JSON forces the sniffer not to include ':' and '}' among the
        # possible delimiters
        table = rows.Table(fields=OrderedDict([
            ('jsoncolumn1', rows.fields.JSONField),
            ('jsoncolumn2', rows.fields.JSONField),
        ]))
        table.append({
            'jsoncolumn1': '{"a": 42}',
            'jsoncolumn2': '{"b": 43}',
        })
        table.append({
            'jsoncolumn1': '{"c": 44}',
            'jsoncolumn2': '{"d": 45}',
        })
        rows.export_to_csv(table, filename, encoding=encoding)

        table = rows.import_from_csv(filename, encoding=encoding)

        self.assertEqual(table.field_names, ['jsoncolumn1', 'jsoncolumn2'])
        self.assertDictEqual(table[0].jsoncolumn1, {'a': 42})
        self.assertDictEqual(table[0].jsoncolumn2, {'b': 43})
        self.assertDictEqual(table[1].jsoncolumn1, {'c': 44})
        self.assertDictEqual(table[1].jsoncolumn2, {'d': 45})
Example #28
 def situacao_candidatura(self):
     return {
         (row.codigo_situacao_candidatura, row.situacao_candidatura): row.nova_situacao_candidatura
         for row in rows.import_from_csv(settings.HEADERS_PATH / "situacao-candidatura.csv")
     }
Example #29
    def test_export_to_csv_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        rows.export_to_csv(utils.table, temp.name)

        table = rows.import_from_csv(temp.name)
        self.assert_table_equal(table, utils.table)
Example #30
def read_epidemiological_week():
    filename = "data/epidemiological-week.csv"
    table = rows.import_from_csv(filename)
    return {
        row.date:
        int(f"{row.epidemiological_year}{row.epidemiological_week:02d}")
        for row in table
    }
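
The comprehension above packs the year and the zero-padded week number into one integer key; a quick check of the arithmetic (values hypothetical):

year, week = 2020, 5
assert int(f"{year}{week:02d}") == 202005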
Example #31
 def parse(self, response):
     table = rows.import_from_csv(io.BytesIO(response.body), encoding="utf-8")
     for row in table:
         yield scrapy.Request(
             gdocs_xlsx_download_url(row.planilha_brasilio),
             meta={"state": row.uf},
             callback=self.parse_state_file,
         )
Example #32
def get_cities():
    table = rows.import_from_csv(
        POPULATION_DATA_PATH, force_types=load_schema(str(POPULATION_SCHEMA_PATH)),
    )
    cities = defaultdict(dict)
    for row in table:
        cities[row.state][row.city] = row
    return cities
Example #33
    def test_import_from_csv_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        with open(self.filename) as fobj:
            table = rows.import_from_csv(fobj, encoding=self.encoding)
        self.assert_table_equal(table, utils.table)

        expected_meta = {'imported_from': 'csv', 'filename': self.filename,}
        self.assertEqual(table.meta, expected_meta)
Example #34
def get_actions_for_state(state):
    url = "https://docs.google.com/spreadsheets/d/1epxFffymqv1t2s37rQ-p5eKpvecOIBzCfJPLI53wYTY/export?format=csv&id=1epxFffymqv1t2s37rQ-p5eKpvecOIBzCfJPLI53wYTY&gid=1565988556"
    response = requests.get(url)
    table = rows.import_from_csv(io.BytesIO(response.content), encoding="utf-8")
    return [
        Action(year=row.ano, state=row.estado, name=row.nome_acao, code=row.codigo_acao)
        for row in table
        if row.estado == state and all((row.ano, row.estado, row.codigo_acao, row.nome_acao))
    ]
Example #35
def read_epidemiological_week():
    # TODO: use pkg_resources to get correct path
    filename = "covid19br/data/epidemiological-week.csv"
    table = rows.import_from_csv(filename)
    return {
        row.date:
        int(f"{row.epidemiological_year}{row.epidemiological_week:02d}")
        for row in table
    }
Example #36
    def test_validate_if_all_cities_exists_are_in_the_state(self):
        file_rows = rows.import_from_csv(self.file_from_content)

        with pytest.raises(SpreadsheetValidationErrors) as execinfo:
            format_spreadsheet_rows_as_dict(file_rows, self.date, "SP")

        exception = execinfo.value
        assert "Abatiá não pertence à UF SP" in exception.error_messages
        assert "Adrianópolis não pertence à UF SP" in exception.error_messages
Example #37
 def test_real_data_2(self):
     filename = "tests/data/milho-safra-2017"
     result = rows.import_from_pdf(
         filename + ".pdf",
         backend=self.backend,
         starts_after=re.compile("MILHO SAFRA 16/17: ACOMPANHAMENTO DE .*"),
         ends_before="*Variação em pontos percentuais.",
     )
     expected = rows.import_from_csv(filename + ".csv")
     self.assertEqual(list(expected), list(result))
Example #38
    def test_import_from_csv_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {'encoding': 'utf-8', 'some_key': 123, 'other': 456, }
        result = rows.import_from_csv(self.filename, **kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs['meta'] = {'imported_from': 'csv', 'filename': self.filename, }
        self.assertEqual(call[1], kwargs)
Example #39
 def test_rects_boundaries(self):
     filename = "tests/data/ibama-autuacao-amazonas-2010-pag2"
     result = rows.import_from_pdf(
         filename + ".pdf",
         backend=self.backend,
         starts_after=re.compile("DIRETORIA DE PROTE.*"),
         ends_before=re.compile("Pag [0-9]+/[0-9]+"),
         algorithm="rects-boundaries",
     )
     expected = rows.import_from_csv(filename + ".csv")
     self.assertEqual(list(expected), list(result))
Example #40
    def test_issue_168(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
        table.append({"jsoncolumn": '{"python": 42}'})
        rows.export_to_csv(table, filename)

        table2 = rows.import_from_csv(filename)
        self.assert_table_equal(table, table2)
Example #41
    def test_import_field_limit(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.import_from_dicts([{"f1": "a" * 132000}])
        rows.export_to_csv(table, filename)

        # The following line must not raise the exception:
        # `_csv.Error: field larger than field limit (131072)`
        new = rows.import_from_csv(filename)
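
The limit in the comment comes from Python's stdlib `csv` module, which rejects fields longer than 131072 characters unless the limit is raised first; a plain-stdlib illustration of the failure mode (how `rows` lifts the limit internally is not shown here):

import csv
import io

big_field = "a" * 132000
csv.field_size_limit(2 ** 24)  # without this, reading the field raises _csv.Error
reader = csv.reader(io.StringIO('"%s"\n' % big_field))
assert len(next(reader)[0]) == 132000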
Example #42
    def test_export_to_csv_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        rows.export_to_csv(utils.table, temp.name)

        table = rows.import_from_csv(temp.name)
        self.assert_table_equal(table, utils.table)

        temp.file.seek(0)
        result = temp.file.read()
        export_in_memory = rows.export_to_csv(utils.table, None)
        self.assertEqual(result, export_in_memory)
Example #43
    def setUp(self):
        rows_xpath = '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
        fields_xpath = OrderedDict([
                ('url', './/h2/a/@href'),
                ('name', './/h2/a/text()'),
                ('address', './/div[@class="spField field_direccion"]/text()'),
                ('phone', './/div[@class="spField field_telefono"]/text()'),
                ('website', './/div[@class="spField field_sitio_web"]/text()'),
                ('email', './/div[@class="spField field_email"]/text()'), ])
        self.kwargs = {'rows_xpath': rows_xpath,
                       'fields_xpath': fields_xpath, }

        self.expected_table = rows.import_from_csv(self.expected_data)
        self.files_to_delete = []
Example #44
    def test_import_from_csv_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {"some_key": 123, "other": 456}
        result = rows.import_from_csv(self.filename, encoding="utf-8", **kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {
            "imported_from": "csv",
            "filename": self.filename,
            "encoding": "utf-8",
        }
        self.assertEqual(call[1], kwargs)
Example #45
    def test_import_from_xpath_filename(self):
        table = rows.import_from_xpath(self.filename,
                                       encoding=self.encoding,
                                       **self.kwargs)

        expected_meta = {'imported_from': 'xpath', 'filename': self.filename,}
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
Example #46
def import_data(data_dir):
    for f in csv_files(data_dir):
        csv = rows.import_from_csv(str(f.realpath()))
        logging.warning('Import file:' + str(f.realpath()))
        csv = csv[::-1]
        s = sessions_history(csv[0])
        s['prev_close'] = csv[0].open
        sh = Sessions_History()
        sh.set(**s)
        commit()
        size = len(csv)
        for i in range(1, size):
            s = sessions_history(csv[i])
            s['prev_close'] = csv[i - 1].close
            sh = Sessions_History()
            sh.set(**s)
            commit()
Example #47
    def setUp(self):
        rows_xpath = (
            '//*[@class="entry-container"]/*[@class="row-fluid"]/*[@class="span6"]'
        )
        fields_xpath = OrderedDict(
            [
                ("url", ".//h2/a/@href"),
                ("name", ".//h2/a/text()"),
                ("address", './/div[@class="spField field_direccion"]/text()'),
                ("phone", './/div[@class="spField field_telefono"]/text()'),
                ("website", './/div[@class="spField field_sitio_web"]/text()'),
                ("email", './/div[@class="spField field_email"]/text()'),
            ]
        )
        self.kwargs = {"rows_xpath": rows_xpath, "fields_xpath": fields_xpath}

        self.expected_table = rows.import_from_csv(self.expected_data)
        self.files_to_delete = []
Example #48
    def test_import_from_csv_discover_dialect_decode_error(self):

        # Create a 1024-bytes line (if encoded to ASCII, UTF-8 etc.)
        line = '"' + ("a" * 508) + '", "' + ("b" * 508) + '"\r\n'
        lines = 256 * line  # 256KiB

        # Now change the last byte (in the 256KiB sample) to have half of a
        # character representation (when encoded to UTF-8)
        data = lines[:-3] + '++Á"\r\n'
        data = data.encode("utf-8")

        # Should not raise `UnicodeDecodeError`
        table = rows.import_from_csv(
            BytesIO(data), encoding="utf-8", sample_size=262144
        )

        last_row = table[-1]
        last_column = "b" * 508
        self.assertEqual(getattr(last_row, last_column), "b" * 508 + "++Á")
Example #49
    def run(self):
        google_username = "******"
        google_password = "******"
        path = "/home/vagner/workspace/ITSGoogleTrends/output/"

        # connect to Google
        try:
            self.show("Realizando conexão com usuário " + google_username)
            connector = pyGTrends(google_username, google_password)
            self.show("Conexão realizada com sucesso")
        except Exception as e:
            raise ITSGoogleTrendsError("Erro durante a conexão com o Google.")
        # build the request string

        # read the data from the CSV file
        table = rows.import_from_csv(self._CSV_FILE_PATH + self._CSV_FILE_NAME)

        rows_number = len(table)
        index = 0

        for row in table:
            its_name = str(row.system).lower()
            index = index + 1
            self.show("Início da busca dos dados de tendência para o ITS: {0:s} [{1:d}/{2:d}]".format(its_name,index,rows_number))
            str_request =  self._DEFAUT_KEYWORD + "," + its_name

            self.show("Realizando uma requisição com a sentença " + str_request)
            # make request
            connector.request_report(str_request)

            # download file
            self._now = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
            csv_file_name = "{0:s}-{1:s}".format(self._now, str_request.replace(",", "-").replace(" ","-"))
            connector.save_csv(path, csv_file_name)
            self.show("Resultados escritos no arquivos {0:s}.csv".format(csv_file_name))

            # wait a random amount of time between requests to avoid bot detection
            wait_time = randint(5, 10)
            self.show("Aguardando {0:d} segundos para uma nova requisição".format(wait_time))
            time.sleep(wait_time)
        #end for
        self.show("Execução realizada com sucesso!")
Example #50
    def test_import_from_xpath_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        with open(self.filename, mode='rb') as fobj:
            table = rows.import_from_xpath(fobj,
                                           encoding=self.encoding,
                                           **self.kwargs)

        expected_meta = {'imported_from': 'xpath',
                         'filename': self.filename,
                         'encoding': self.encoding, }
        self.assertEqual(table.meta, expected_meta)

        temp = tempfile.NamedTemporaryFile(delete=False)
        self.files_to_delete.append(temp.name)
        fobj = temp.file
        rows.export_to_csv(table, fobj)
        fobj.seek(0)
        table = rows.import_from_csv(fobj)

        self.assert_table_equal(table, self.expected_table)
Example #51
    def test_detect_dialect_more_data(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = '{}.{}'.format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        # If the sniffer reads only the first line, it will think the delimiter
        # is ',' instead of ';'
        data = textwrap.dedent('''
            field1,samefield;field2,other
            row1value1;row1value2
            row2value1;row2value2
            ''').strip()
        with open(filename, 'wb') as fobj:
            fobj.write(data.encode('utf-8'))

        table = rows.import_from_csv(filename, encoding='utf-8')
        self.assertEqual(table.field_names, ['field1samefield', 'field2other'])
        self.assertEqual(table[0].field1samefield, 'row1value1')
        self.assertEqual(table[0].field2other, 'row1value2')
        self.assertEqual(table[1].field1samefield, 'row2value1')
        self.assertEqual(table[1].field2other, 'row2value2')
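
A stdlib sketch of the sniffing behavior this test depends on: in the first line alone both ',' and ';' look like plausible delimiters, but a multi-line sample lets `csv.Sniffer` settle on ';', the only character that appears consistently on every line.

import csv

sample = "field1,samefield;field2,other\nrow1value1;row1value2\nrow2value1;row2value2\n"
dialect = csv.Sniffer().sniff(sample)
assert dialect.delimiter == ";"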
Example #52
# coding: utf-8

from __future__ import division
from collections import OrderedDict
import rows
import re

table = rows.import_from_csv('../0cleaning/clean_data2.csv')

quantidade_de_exemplos = len(table)
statistics = {}
for row in table:
    drow = row._asdict()  # rows' Row is a namedtuple; use _asdict() instead of __dict__
    
    for key, value in drow.items():
        if key not in statistics:
            statistics[key] = {}
        
        if value not in statistics[key]:
            statistics[key][value] = 0
        statistics[key][value] += 1

string = rows.fields.UnicodeField
table_output = rows.Table(fields=OrderedDict([
    ('column', string), ('value', string), ('amount', string), ('percent', string),
]))
for key in statistics.keys():
    for value in statistics[key].keys():
        table_output.append({
            'column': key,
            'value': value,
            'amount': statistics[key][value],
            'percent': "{0:.2f}".format(statistics[key][value] / quantidade_de_exemplos * 100),
        })
Example #53
 def table(self):
     """Returns the table with all data in rows format."""
     file_data = self._file_data()
     data = StringIO.StringIO(file_data)
     table = rows.import_from_csv(data)
     return table
Example #54
# This example was based on:
# https://github.com/compjour/search-script-scrape/blob/master/scripts/101.py

from io import BytesIO

import requests

import rows

# Capture
url = "http://unitedstates.sunlightfoundation.com/legislators/legislators.csv"
csv = BytesIO(requests.get(url).content)

# Normalize
table = rows.import_from_csv(csv)

# Analyze
total = len(table)
total_in_office = sum(1 for row in table if row.in_office)
men = sum(1 for row in table if row.gender == "M")
men_in_office = sum(1 for row in table if row.gender == "M" and row.in_office)
women = sum(1 for row in table if row.gender == "F")
women_in_office = sum(1 for row in table if row.gender == "F" and row.in_office)

# View
print(
    "  Men: {}/{} ({:02.2f}%), in office: {}/{} ({:02.2f}%)".format(
        men,
        total,
        100 * men / float(total),
        men_in_office,
        men,
        100 * men_in_office / float(men),
    )
)
Example #55
    def update_data_for_year(self, year):
        self.debug(u'Updating data for year {0}'.format(year))

        try:
            csv_data = self.retrieve_data_for_year(year).replace('\r\n', '\n')
        except Exception:
            print u'No data found for year {0}'.format(year)
            return

        # Skip first line
        head, tail = csv_data.split('\n', 1)
        self.debug(u'Reading file...')
        data = rows.import_from_csv(BytesIO(tail.encode('utf-8')))

        if not data:
            self.debug(u'Error downloading file for year {0}'.format(year))
            return

        expected_header = [
            u'ano',
            u'mes',
            u'senador',
            u'tipo_despesa',
            u'cnpj_cpf',
            u'fornecedor',
            u'documento',
            u'data',
            u'detalhamento',
            u'valor_reembolsado',
        ]

        actual_header = data.fields.keys()

        if actual_header != expected_header:
            # FIXME
            print u'Bad CSV: expected header {0}, got {1}'.format(
                expected_header, actual_header
            )
            return

        archived_expense_list = []
        objects_counter = 0
        archived_expense_list_counter = len(data)

        legislators = {}
        mandates = {}
        natures = {}

        for row in data:
            if not row.senador:
                self.debug(u'Skipping row without senador')
                continue

            if not row.data:
                date = '01/{0}/{1}'.format(row.mes, row.ano)
                expense_date = datetime.strptime(date, '%d/%m/%Y')
            else:
                expense_date = datetime.strptime(row.data, '%d/%m/%Y')

            name = self._normalize_name(row.senador)
            nature = row.tipo_despesa
            cpf_cnpj = row.cnpj_cpf
            supplier_name = row.fornecedor
            docnumber = row.documento
            expensed = row.valor_reembolsado

            # FIXME: WTF?
            if isinstance(expensed, unicode):
                expensed = float(expensed.replace(',', '.').replace('\r', '').replace('\n', ''))

            # memory cache
            expense_nature = natures.get(nature)
            if not expense_nature:
                expense_nature, _ = ExpenseNature.objects.get_or_create(name=nature)
                natures[nature] = expense_nature

            supplier = self.get_or_create_supplier(cpf_cnpj, supplier_name)

            # memory cache
            legislator = legislators.get(name)
            if not legislator:
                legislator = self._get_or_create_legislator(name)
                legislators[name] = legislator

            # memory cache
            mandate = mandates.get(name)
            if not mandate:
                mandate = self.mandate_for_legislator(legislator, None)
                mandates[name] = mandate

            expense = ArchivedExpense(
                number=docnumber,
                nature=expense_nature,
                date=expense_date,
                expensed=expensed,
                mandate=mandate,
                supplier=supplier,
                collection_run=self.collection_run
            )
            archived_expense_list.append(expense)
            self.debug(u'New expense found: {0}'.format(unicode(expense)))

            objects_counter += 1
            archived_expense_list_counter -= 1

            # We create a list with up to OBJECT_LIST_MAXIMUM_COUNTER.
            # If that lists is equal to the maximum object count allowed
            # or if there are no more objects in archived_expense_list,
            # we bulk_create() them and clear the list.

            if objects_counter == OBJECT_LIST_MAXIMUM_COUNTER or archived_expense_list_counter == 0:
                ArchivedExpense.objects.bulk_create(archived_expense_list)
                archived_expense_list[:] = []
                objects_counter = 0
                reset_queries()
Example #56
import rows
from io import BytesIO

print '------\nReading file\n-----'
filename = 'data/PERM_Disclosure_Data_FY16_Q2_shorter.csv' # shorter csv
f = open(filename,'r')
filedata = f.read()
f.close()

filedata = filedata.replace('2007_NAICS_US_CODE', 'COLUMN_2007_NAICS_US_CODE')
filedata = filedata.replace('2007_NAICS_US_TITLE', 'COLUMN_2007_NAICS_US_TITLE')
visas_data = rows.import_from_csv(BytesIO(filedata))

print 'Hey, rows automatically identified the types:'
for field_name, field_type in visas_data.fields.items():
    print '{} is {}'.format(field_name, field_type)

print '------\nStart analysis\n-----'
certified = filter(lambda row: row.case_status == 'Certified',  visas_data)
denied = filter(lambda row: row.case_status == 'Denied', visas_data)
print 'Certified vs Denied: {} vs {}'.format(len(certified), len(denied))

developer_code = "15-1133"
developers = filter(lambda row: row.pw_soc_code == developer_code,  visas_data)
non_developers = filter(lambda row: row.pw_soc_code != developer_code,  visas_data)
print 'Devs vs Non-Devs: {} vs {}'.format(len(developers), len(non_developers))

developers_certified = filter(lambda row: row.case_status == 'Certified'
                                and row.pw_soc_code == developer_code,  visas_data)
developers_denied = filter(lambda row: row.case_status == 'Denied'
                                and row.pw_soc_code == developer_code,  visas_data)
Example #57
 def test_real_data_1(self):
     filename = "tests/data/balneabilidade-26-2010"
     result = rows.import_from_pdf(filename + ".pdf", backend=self.backend)
     expected = rows.import_from_csv(filename + ".csv")
     self.assertEqual(list(expected), list(result))